summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/procfs-diskstats10
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst92
-rw-r--r--Documentation/block/null_blk.txt7
-rw-r--r--Documentation/block/stat.txt28
-rw-r--r--Documentation/iostats.txt15
-rw-r--r--block/Kconfig16
-rw-r--r--block/Makefile4
-rw-r--r--block/bfq-iosched.c131
-rw-r--r--block/bfq-iosched.h7
-rw-r--r--block/bfq-wf2q.c30
-rw-r--r--block/bio-integrity.c22
-rw-r--r--block/bio.c208
-rw-r--r--block/blk-cgroup.c284
-rw-r--r--block/blk-core.c106
-rw-r--r--block/blk-ioc.c2
-rw-r--r--block/blk-iolatency.c955
-rw-r--r--block/blk-lib.c10
-rw-r--r--block/blk-mq-debugfs-zoned.c24
-rw-r--r--block/blk-mq-debugfs.c24
-rw-r--r--block/blk-mq-debugfs.h9
-rw-r--r--block/blk-mq-pci.c5
-rw-r--r--block/blk-mq-sched.c112
-rw-r--r--block/blk-mq-tag.c11
-rw-r--r--block/blk-mq.c173
-rw-r--r--block/blk-mq.h13
-rw-r--r--block/blk-rq-qos.c194
-rw-r--r--block/blk-rq-qos.h109
-rw-r--r--block/blk-settings.c6
-rw-r--r--block/blk-stat.c16
-rw-r--r--block/blk-stat.h4
-rw-r--r--block/blk-sysfs.c37
-rw-r--r--block/blk-throttle.c32
-rw-r--r--block/blk-wbt.c425
-rw-r--r--block/blk-wbt.h68
-rw-r--r--block/blk-zoned.c2
-rw-r--r--block/blk.h7
-rw-r--r--block/bounce.c69
-rw-r--r--block/bsg-lib.c5
-rw-r--r--block/bsg.c460
-rw-r--r--block/cfq-iosched.c23
-rw-r--r--block/genhd.c29
-rw-r--r--block/partition-generic.c25
-rw-r--r--block/partitions/aix.c13
-rw-r--r--block/partitions/ldm.c3
-rw-r--r--block/t10-pi.c110
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/ata/libata-scsi.c18
-rw-r--r--drivers/block/DAC960.c9
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile5
-rw-r--r--drivers/block/aoe/aoecmd.c1
-rw-r--r--drivers/block/aoe/aoedev.c4
-rw-r--r--drivers/block/brd.c14
-rw-r--r--drivers/block/drbd/drbd_int.h2
-rw-r--r--drivers/block/drbd/drbd_main.c12
-rw-r--r--drivers/block/drbd/drbd_receiver.c6
-rw-r--r--drivers/block/drbd/drbd_req.c4
-rw-r--r--drivers/block/drbd/drbd_worker.c4
-rw-r--r--drivers/block/floppy.c3
-rw-r--r--drivers/block/loop.c3
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c3
-rw-r--r--drivers/block/null_blk.h108
-rw-r--r--drivers/block/null_blk_main.c (renamed from drivers/block/null_blk.c)129
-rw-r--r--drivers/block/null_blk_zoned.c149
-rw-r--r--drivers/block/paride/bpck.c3
-rw-r--r--drivers/block/paride/pd.c2
-rw-r--r--drivers/block/pktcdvd.c109
-rw-r--r--drivers/block/rsxx/dev.c6
-rw-r--r--drivers/block/skd_main.c16
-rw-r--r--drivers/block/xen-blkfront.c9
-rw-r--r--drivers/block/zram/zram_drv.c19
-rw-r--r--drivers/cdrom/cdrom.c30
-rw-r--r--drivers/ide/ide-cd.c58
-rw-r--r--drivers/ide/ide-cd.h6
-rw-r--r--drivers/ide/ide-cd_ioctl.c62
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c2
-rw-r--r--drivers/lightnvm/Kconfig30
-rw-r--r--drivers/lightnvm/pblk-cache.c9
-rw-r--r--drivers/lightnvm/pblk-core.c78
-rw-r--r--drivers/lightnvm/pblk-gc.c34
-rw-r--r--drivers/lightnvm/pblk-init.c98
-rw-r--r--drivers/lightnvm/pblk-rb.c24
-rw-r--r--drivers/lightnvm/pblk-read.c247
-rw-r--r--drivers/lightnvm/pblk-recovery.c47
-rw-r--r--drivers/lightnvm/pblk-sysfs.c13
-rw-r--r--drivers/lightnvm/pblk-write.c35
-rw-r--r--drivers/lightnvm/pblk.h48
-rw-r--r--drivers/md/bcache/bcache.h24
-rw-r--r--drivers/md/bcache/bset.c63
-rw-r--r--drivers/md/bcache/btree.c63
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/closure.c13
-rw-r--r--drivers/md/bcache/closure.h4
-rw-r--r--drivers/md/bcache/debug.c17
-rw-r--r--drivers/md/bcache/journal.c1
-rw-r--r--drivers/md/bcache/request.c75
-rw-r--r--drivers/md/bcache/super.c59
-rw-r--r--drivers/md/bcache/sysfs.c48
-rw-r--r--drivers/md/bcache/util.c2
-rw-r--r--drivers/md/bcache/util.h2
-rw-r--r--drivers/md/bcache/writeback.c125
-rw-r--r--drivers/md/bcache/writeback.h19
-rw-r--r--drivers/md/dm.c6
-rw-r--r--drivers/md/md.c12
-rw-r--r--drivers/nvdimm/btt.c12
-rw-r--r--drivers/nvdimm/nd.h7
-rw-r--r--drivers/nvdimm/pmem.c13
-rw-r--r--drivers/nvme/host/core.c108
-rw-r--r--drivers/nvme/host/fabrics.c2
-rw-r--r--drivers/nvme/host/fc.c1
-rw-r--r--drivers/nvme/host/lightnvm.c27
-rw-r--r--drivers/nvme/host/multipath.c349
-rw-r--r--drivers/nvme/host/nvme.h78
-rw-r--r--drivers/nvme/host/pci.c77
-rw-r--r--drivers/nvme/host/rdma.c234
-rw-r--r--drivers/nvme/host/trace.c11
-rw-r--r--drivers/nvme/host/trace.h142
-rw-r--r--drivers/nvme/target/admin-cmd.c221
-rw-r--r--drivers/nvme/target/configfs.c250
-rw-r--r--drivers/nvme/target/core.c104
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c7
-rw-r--r--drivers/nvme/target/io-cmd-file.c80
-rw-r--r--drivers/nvme/target/loop.c1
-rw-r--r--drivers/nvme/target/nvmet.h62
-rw-r--r--drivers/nvme/target/rdma.c197
-rw-r--r--drivers/scsi/Makefile2
-rw-r--r--drivers/scsi/cxlflash/superpipe.c8
-rw-r--r--drivers/scsi/cxlflash/vlun.c7
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c2
-rw-r--r--drivers/scsi/scsi_lib.c6
-rw-r--r--drivers/scsi/sd.c8
-rw-r--r--drivers/scsi/sd.h9
-rw-r--r--drivers/scsi/sd_dif.c113
-rw-r--r--drivers/scsi/sr_ioctl.c22
-rw-r--r--drivers/scsi/virtio_scsi.c8
-rw-r--r--drivers/target/Kconfig5
-rw-r--r--drivers/target/loopback/Kconfig1
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/exofs/ore.c4
-rw-r--r--fs/ext4/super.c5
-rw-r--r--fs/ext4/sysfs.c6
-rw-r--r--fs/f2fs/f2fs.h2
-rw-r--r--fs/f2fs/super.c3
-rw-r--r--fs/mpage.c4
-rw-r--r--include/linux/bio.h19
-rw-r--r--include/linux/blk-cgroup.h146
-rw-r--r--include/linux/blk-mq.h4
-rw-r--r--include/linux/blk_types.h27
-rw-r--r--include/linux/blkdev.h66
-rw-r--r--include/linux/cdrom.h3
-rw-r--r--include/linux/cgroup-defs.h3
-rw-r--r--include/linux/genhd.h14
-rw-r--r--include/linux/memcontrol.h13
-rw-r--r--include/linux/nvme.h72
-rw-r--r--include/linux/sched.h8
-rw-r--r--include/linux/swap.h11
-rw-r--r--include/linux/t10-pi.h24
-rw-r--r--include/linux/tracehook.h2
-rw-r--r--include/scsi/scsi_cmnd.h13
-rw-r--r--include/scsi/scsi_device.h14
-rw-r--r--include/uapi/linux/bcache.h4
-rw-r--r--include/uapi/linux/blkzoned.h2
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/memcontrol.c13
-rw-r--r--mm/memory.c11
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/readahead.c19
-rw-r--r--mm/shmem.c10
-rw-r--r--mm/swapfile.c31
172 files changed, 6029 insertions, 2659 deletions
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index f91a973a37fe..abac31d216de 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -5,6 +5,7 @@ Description:
The /proc/diskstats file displays the I/O statistics
of block devices. Each line contains the following 14
fields:
+
1 - major number
2 - minor mumber
3 - device name
@@ -19,4 +20,13 @@ Description:
12 - I/Os currently in progress
13 - time spent doing I/Os (ms)
14 - weighted time spent doing I/Os (ms)
+
+ Kernel 4.18+ appends four more fields for discard
+ tracking putting the total at 18:
+
+ 15 - discards completed successfully
+ 16 - discards merged
+ 17 - sectors discarded
+ 18 - time spent discarding
+
For more details refer to Documentation/iostats.txt
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8a2c52d5c53b..1746131bc9cb 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/.
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
+ 5-3-3. IO Latency
+ 5-3-3-1. How IO Latency Throttling Works
+ 5-3-3-2. IO Latency Interface Files
5-4. PID
5-4-1. PID Interface Files
5-5. Device
@@ -1314,17 +1317,19 @@ IO Interface Files
Lines are keyed by $MAJ:$MIN device numbers and not ordered.
The following nested keys are defined.
- ====== ===================
+ ====== =====================
rbytes Bytes read
wbytes Bytes written
rios Number of read IOs
wios Number of write IOs
- ====== ===================
+ dbytes Bytes discarded
+ dios Number of discard IOs
+ ====== =====================
An example read output follows:
- 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
- 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+ 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
+ 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
io.weight
A read-write flat-keyed file which exists on non-root cgroups.
@@ -1446,6 +1451,85 @@ writeback as follows.
vm.dirty[_background]_ratio.
+IO Latency
+~~~~~~~~~~
+
+This is a cgroup v2 controller for IO workload protection. You provide a group
+with a latency target, and if the average latency exceeds that target the
+controller will throttle any peers that have a lower latency target than the
+protected workload.
+
+The limits are only applied at the peer level in the hierarchy. This means that
+in the diagram below, only groups A, B, and C will influence each other, and
+groups D and F will influence each other. Group G will influence nobody.
+
+ [root]
+ / | \
+ A B C
+ / \ |
+ D F G
+
+
+So the ideal way to configure this is to set io.latency in groups A, B, and C.
+Generally you do not want to set a value lower than the latency your device
+supports. Experiment to find the value that works best for your workload.
+Start at higher than the expected latency for your device and watch the
+avg_lat value in io.stat for your workload group to get an idea of the
+latency you see during normal operation. Use the avg_lat value as a basis for
+your real setting, setting at 10-15% higher than the value in io.stat.
+
+How IO Latency Throttling Works
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+io.latency is work conserving; so as long as everybody is meeting their latency
+target the controller doesn't do anything. Once a group starts missing its
+target it begins throttling any peer group that has a higher target than itself.
+This throttling takes 2 forms:
+
+- Queue depth throttling. This is the number of outstanding IO's a group is
+ allowed to have. We will clamp down relatively quickly, starting at no limit
+ and going all the way down to 1 IO at a time.
+
+- Artificial delay induction. There are certain types of IO that cannot be
+ throttled without possibly adversely affecting higher priority groups. This
+ includes swapping and metadata IO. These types of IO are allowed to occur
+ normally, however they are "charged" to the originating group. If the
+ originating group is being throttled you will see the use_delay and delay
+ fields in io.stat increase. The delay value is how many microseconds that are
+ being added to any process that runs in this group. Because this number can
+ grow quite large if there is a lot of swapping or metadata IO occurring we
+ limit the individual delay events to 1 second at a time.
+
+Once the victimized group starts meeting its latency target again it will start
+unthrottling any peer groups that were throttled previously. If the victimized
+group simply stops doing IO the global counter will unthrottle appropriately.
+
+IO Latency Interface Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ io.latency
+ This takes a similar format as the other controllers.
+
+ "MAJOR:MINOR target=<target time in microseconds"
+
+ io.stat
+ If the controller is enabled you will see extra stats in io.stat in
+ addition to the normal ones.
+
+ depth
+ This is the current queue depth for the group.
+
+ avg_lat
+ This is an exponential moving average with a decay rate of 1/exp
+ bound by the sampling interval. The decay rate interval can be
+ calculated by multiplying the win value in io.stat by the
+ corresponding number of samples based on the win value.
+
+ win
+ The sampling window size in milliseconds. This is the minimum
+ duration of time between evaluation events. Windows only elapse
+ with IO activity. Idle periods extend the most recent window.
+
PID
---
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 07f147381f32..ea2dafe49ae8 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
0: Tag set is not shared.
1: Tag set shared between devices for blk-mq. Only makes sense with
nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+ 0: Block device is exposed as a random-access block device.
+ 1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+ Per zone size when exposed as a zoned block device. Must be a power of two.
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
index 0dbc946de2ea..0aace9cc536c 100644
--- a/Documentation/block/stat.txt
+++ b/Documentation/block/stat.txt
@@ -31,28 +31,32 @@ write ticks milliseconds total wait time for write requests
in_flight requests number of I/Os currently in flight
io_ticks milliseconds total time this block device has been active
time_in_queue milliseconds total wait time for all requests
+discard I/Os requests number of discard I/Os processed
+discard merges requests number of discard I/Os merged with in-queue I/O
+discard sectors sectors number of sectors discarded
+discard ticks milliseconds total wait time for discard requests
-read I/Os, write I/Os
-=====================
+read I/Os, write I/Os, discard I/0s
+===================================
These values increment when an I/O request completes.
-read merges, write merges
-=========================
+read merges, write merges, discard merges
+=========================================
These values increment when an I/O request is merged with an
already-queued I/O request.
-read sectors, write sectors
-===========================
+read sectors, write sectors, discard_sectors
+============================================
-These values count the number of sectors read from or written to this
-block device. The "sectors" in question are the standard UNIX 512-byte
-sectors, not any device- or filesystem-specific block size. The
-counters are incremented when the I/O completes.
+These values count the number of sectors read from, written to, or
+discarded from this block device. The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size. The counters are incremented when the I/O completes.
-read ticks, write ticks
-=======================
+read ticks, write ticks, discard ticks
+======================================
These values count the number of milliseconds that I/O requests have
waited on this block device. If there are multiple I/O requests waiting,
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index 04d394a2e06c..49df45f90e8a 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -31,6 +31,9 @@ Here are examples of these different formats::
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
3 1 hda1 35486 38030 38030 38030
+ 4.18+ diskstats:
+ 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
@@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
last update of this field. This can provide an easy measure of both
I/O completion time and the backlog that may be accumulating.
+Field 12 -- # of discards completed
+ This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+ See the description of field 2
+
+Field 14 -- # of sectors discarded
+ This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+ This is the total number of milliseconds spent by all discards (as
+ measured from __make_request() to end_that_request_last()).
To avoid introducing performance bottlenecks, no locks are held while
modifying these counters. This implies that minor inaccuracies may be
diff --git a/block/Kconfig b/block/Kconfig
index eb50fd4977c2..1f2469a0123c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -149,6 +149,18 @@ config BLK_WBT
dynamically on an algorithm loosely based on CoDel, factoring in
the realtime performance of the disk.
+config BLK_CGROUP_IOLATENCY
+ bool "Enable support for latency based cgroup IO protection"
+ depends on BLK_CGROUP=y
+ default n
+ ---help---
+ Enabling this option enables the .latency interface for IO throttling.
+ The IO controller will attempt to maintain average IO latencies below
+ the configured latency target, throttling anybody with a higher latency
+ target than the victimized group.
+
+ Note, this is an experimental interface and could be changed someday.
+
config BLK_WBT_SQ
bool "Single queue writeback throttling"
default n
@@ -177,6 +189,10 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
+config BLK_DEBUG_FS_ZONED
+ bool
+ default BLK_DEBUG_FS && BLK_DEV_ZONED
+
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
---help---
diff --git a/block/Makefile b/block/Makefile
index 6a56303b9925..572b33f32c07 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o partition-generic.o ioprio.o \
- badblocks.o partitions/
+ badblocks.o partitions/ blk-rq-qos.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
@@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
@@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
+obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 495b9ddb3355..41d9036b1822 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
+ * bfq_better_to_idle()).
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
@@ -742,8 +742,9 @@ inc_counter:
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
- struct rb_root *root)
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root)
{
if (!entity->weight_counter)
return;
@@ -760,6 +761,43 @@ reset_entity_pointer:
}
/*
+ * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
+ * parent entities.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = bfqq->entity.parent;
+
+ __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ for_each_entity(entity) {
+ struct bfq_sched_data *sd = entity->my_sched_data;
+
+ if (sd->next_in_service || sd->in_service_entity) {
+ /*
+ * entity is still active, because either
+ * next_in_service or in_service_entity is not
+ * NULL (see the comments on the definition of
+ * next_in_service for details on why
+ * in_service_entity must be checked too).
+ *
+ * As a consequence, the weight of entity is
+ * not to be removed. In addition, if entity
+ * is active, then its parent entities are
+ * active as well, and thus their weights are
+ * not to be removed either. In the end, this
+ * loop must stop here.
+ */
+ break;
+ }
+ __bfq_weights_tree_remove(bfqd, entity,
+ &bfqd->group_weights_tree);
+ }
+}
+
+/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
* remain unchanged after such an expiration, and the
* following statement therefore assigns to
* entity->budget the remaining budget on such an
- * expiration. For clarity, entity->service is not
- * updated on expiration in any case, and, in normal
- * operation, is reset only when bfqq is selected for
- * service (see bfq_get_next_queue).
+ * expiration.
*/
entity->budget = min_t(unsigned long,
bfq_bfqq_budget_left(bfqq),
bfqq->max_budget);
+ /*
+ * At this point, we have used entity->service to get
+ * the budget left (needed for updating
+ * entity->budget). Thus we finally can, and have to,
+ * reset entity->service. The latter must be reset
+ * because bfqq would otherwise be charged again for
+ * the service it has received during its previous
+ * service slot(s).
+ */
+ entity->service = 0;
+
return true;
}
+ /*
+ * We can finally complete expiration, by setting service to 0.
+ */
+ entity->service = 0;
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
@@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
ref = bfqq->ref;
__bfq_bfqq_expire(bfqd, bfqq);
+ if (ref == 1) /* bfqq is gone, no more actions on it */
+ return;
+
/* mark bfqq as waiting a request only if a bic still points to it */
- if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+ if (!bfq_bfqq_busy(bfqq) &&
reason != BFQQE_BUDGET_TIMEOUT &&
- reason != BFQQE_BUDGET_EXHAUSTED)
+ reason != BFQQE_BUDGET_EXHAUSTED) {
bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+ /*
+ * Not setting service to 0, because, if the next rq
+ * arrives in time, the queue will go on receiving
+ * service with this same budget (as if it never expired)
+ */
+ } else
+ entity->service = 0;
}
/*
@@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
* issues taken into account are not trivial. We discuss these issues
* individually while introducing the variables.
*/
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool rot_without_queueing =
@@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
}
/*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * If the in-service queue is empty but the function bfq_better_to_idle
* returns true, then:
* 1) the queue must remain in service and cannot be expired, and
* 2) the device must be idled to wait for the possible arrival of a new
* request for the queue.
- * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * See the comments on the function bfq_better_to_idle for the reasons
* why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * and preserve service guarantees when bfq_better_to_idle itself
* returns true.
*/
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
}
/*
@@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+ /*
+ * Do not expire bfqq for budget timeout if bfqq may be about
+ * to enjoy device idling. The reason why, in this case, we
+ * prevent bfqq from expiring is the same as in the comments
+ * on the case where bfq_bfqq_must_idle() returns true, in
+ * bfq_completed_request().
+ */
if (bfq_may_expire_for_budg_timeout(bfqq) &&
- !bfq_bfqq_wait_request(bfqq) &&
!bfq_bfqq_must_idle(bfqq))
goto expire;
@@ -3620,7 +3686,7 @@ check_queue:
* may idle after their completion, then keep it anyway.
*/
if (bfq_bfqq_wait_request(bfqq) ||
- (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
bfqq = NULL;
goto keep_queue;
}
@@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_remove(bfqd, bfqq);
}
now_ns = ktime_get_ns();
@@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* or if we want to idle in case it has no pending requests.
*/
if (bfqd->in_service_queue == bfqq) {
- if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
- bfq_arm_slice_timer(bfqd);
+ if (bfq_bfqq_must_idle(bfqq)) {
+ if (bfqq->dispatched == 0)
+ bfq_arm_slice_timer(bfqd);
+ /*
+ * If we get here, we do not expire bfqq, even
+ * if bfqq was in budget timeout or had no
+ * more requests (as controlled in the next
+ * conditional instructions). The reason for
+ * not expiring bfqq is as follows.
+ *
+ * Here bfqq->dispatched > 0 holds, but
+ * bfq_bfqq_must_idle() returned true. This
+ * implies that, even if no request arrives
+ * for bfqq before bfqq->dispatched reaches 0,
+ * bfqq will, however, not be expired on the
+ * completion event that causes bfqq->dispatch
+ * to reach zero. In contrast, on this event,
+ * bfqq will start enjoying device idling
+ * (I/O-dispatch plugging).
+ *
+ * But, if we expired bfqq here, bfqq would
+ * not have the chance to enjoy device idling
+ * when bfqq->dispatched finally reaches
+ * zero. This would expose bfqq to violation
+ * of its reserved service guarantees.
+ */
return;
} else if (bfq_may_expire_for_budg_timeout(bfqq))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_BUDGET_TIMEOUT);
else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
(bfqq->dispatched == 0 ||
- !bfq_bfqq_may_idle(bfqq)))
+ !bfq_better_to_idle(bfqq)))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_NO_MORE_REQUESTS);
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0f712e03b035..a8a2e5aca4d4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
struct rb_root *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
- struct rb_root *root);
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4498c43245e2..dbc07b456059 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else /* bfq_group */
- bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-
if (bfqg != bfqd->root_group)
bfqg->active_entities++;
#endif
@@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
if (bfqq)
list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else /* bfq_group */
- bfq_weights_tree_remove(bfqd, entity,
- &bfqd->group_weights_tree);
-
if (bfqg != bfqd->root_group)
bfqg->active_entities--;
#endif
@@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (prev_weight != new_weight) {
root = bfqq ? &bfqd->queue_weights_tree :
&bfqd->group_weights_tree;
- bfq_weights_tree_remove(bfqd, entity, root);
+ __bfq_weights_tree_remove(bfqd, entity, root);
}
entity->weight = new_weight;
/*
@@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
* one of its children receives a new request.
*
* Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
+ * inserts entity into its active tree, after possibly extracting it
* from its idle tree.
*/
static void __bfq_activate_entity(struct bfq_entity *entity,
@@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st = true;
}
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_weights_tree_add(bfqg->bfqd, entity,
+ &bfqd->group_weights_tree);
+ }
+#endif
+
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@@ -1542,12 +1545,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
sd->in_service_entity = entity;
/*
- * Reset the accumulator of the amount of service that
- * the entity is about to receive.
- */
- entity->service = 0;
-
- /*
* If entity is no longer a candidate for next
* service, then it must be extracted from its active
* tree, so as to make sure that it won't be
@@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->busy_queues--;
if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_remove(bfqd, bfqq);
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index add7c7c85335..67b5fb861a51 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -160,28 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL(bio_integrity_add_page);
/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi: blk_integrity profile for device
- * @sectors: Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device. Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
* @proc_iter: iterator to process
diff --git a/block/bio.c b/block/bio.c
index 047c5dca6d90..b12966e415d3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,9 +28,11 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-rq-qos.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -156,7 +158,7 @@ out:
unsigned int bvec_nr_vecs(unsigned short idx)
{
- return bvec_slabs[idx].nr_vecs;
+ return bvec_slabs[--idx].nr_vecs;
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
@@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
EXPORT_SYMBOL(bio_clone_fast);
/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_PAGES.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
- */
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_opf = bio_src->bi_opf;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_integrity(bio_src)) {
- int ret;
-
- ret = bio_integrity_clone(bio, bio_src, gfp_mask);
- if (ret < 0) {
- bio_put(bio);
- return NULL;
- }
- }
-
- bio_clone_blkcg_association(bio, bio_src);
-
- return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
@@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio)
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
+ if (!PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1674,19 +1597,15 @@ static void bio_release_pages(struct bio *bio)
struct bio_vec *bvec;
int i;
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page)
- put_page(page);
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
}
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
* If they are, then fine. If, however, some pages are clean then they must
* have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
* here on. It will run one put_page() against each page and will run one
@@ -1704,78 +1623,70 @@ static struct bio *bio_dirty_list;
*/
static void bio_dirty_fn(struct work_struct *work)
{
- unsigned long flags;
- struct bio *bio;
+ struct bio *bio, *next;
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
+ spin_lock_irq(&bio_dirty_lock);
+ next = bio_dirty_list;
bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ spin_unlock_irq(&bio_dirty_lock);
- while (bio) {
- struct bio *next = bio->bi_private;
+ while ((bio = next) != NULL) {
+ next = bio->bi_private;
bio_set_pages_dirty(bio);
bio_release_pages(bio);
bio_put(bio);
- bio = next;
}
}
void bio_check_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
- int nr_clean_pages = 0;
+ unsigned long flags;
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- put_page(page);
- bvec->bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
+ if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+ goto defer;
}
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
+ bio_release_pages(bio);
+ bio_put(bio);
+ return;
+defer:
+ spin_lock_irqsave(&bio_dirty_lock, flags);
+ bio->bi_private = bio_dirty_list;
+ bio_dirty_list = bio;
+ spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ schedule_work(&bio_dirty_work);
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
+ const int sgrp = op_stat_group(op);
int cpu = part_stat_lock();
part_round_stats(q, cpu, part);
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, sectors[rw], sectors);
- part_inc_in_flight(q, part, rw);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, sectors[sgrp], sectors);
+ part_inc_in_flight(q, part, op_is_write(op));
part_stat_unlock();
}
EXPORT_SYMBOL(generic_start_io_acct);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int req_op,
struct hd_struct *part, unsigned long start_time)
{
unsigned long duration = jiffies - start_time;
+ const int sgrp = op_stat_group(req_op);
int cpu = part_stat_lock();
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(q, cpu, part);
- part_dec_in_flight(q, part, rw);
+ part_dec_in_flight(q, part, op_is_write(req_op));
part_stat_unlock();
}
@@ -1834,6 +1745,9 @@ again:
if (!bio_integrity_endio(bio))
return;
+ if (bio->bi_disk)
+ rq_qos_done_bio(bio->bi_disk->queue, bio);
+
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
@@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
#ifdef CONFIG_BLK_CGROUP
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg. This works like
+ * every other associate function wrt references.
+ */
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *blkcg_css;
+
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ if (!page->mem_cgroup)
+ return 0;
+ blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+ &io_cgrp_subsys);
+ bio->bi_css = blkcg_css;
+ return 0;
+}
+#endif /* CONFIG_MEMCG */
+
/**
* bio_associate_blkcg - associate a bio with the specified blkcg
* @bio: target bio
@@ -2065,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg. This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+ if (unlikely(bio->bi_blkg))
+ return -EBUSY;
+ blkg_get(blkg);
+ bio->bi_blkg = blkg;
+ return 0;
+}
+
+/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
*/
@@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio)
css_put(bio->bi_css);
bio->bi_css = NULL;
}
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
}
/**
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb85cb87c40f..694595b29b8f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
+static bool blkcg_debug_stats = false;
+
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
[BLKG_RWSTAT_WRITE] = "Write",
[BLKG_RWSTAT_SYNC] = "Sync",
[BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
};
const char *dname = blkg_dev_name(pd->blkg);
u64 v;
@@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
(unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
return v;
}
@@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
const char *dname;
+ char *buf;
struct blkg_rwstat rwstat;
- u64 rbytes, wbytes, rios, wios;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ size_t size = seq_get_buf(sf, &buf), off = 0;
+ int i;
+ bool has_stats = false;
dname = blkg_dev_name(blkg);
if (!dname)
continue;
+ /*
+ * Hooray string manipulation, count is the size written NOT
+ * INCLUDING THE \0, so size is now count+1 less than what we
+ * had before, but we want to start writing the next bit from
+ * the \0 so we only add count to buf.
+ */
+ off += scnprintf(buf+off, size-off, "%s ", dname);
+
spin_lock_irq(blkg->q->queue_lock);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_bytes));
rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_ios));
rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
spin_unlock_irq(blkg->q->queue_lock);
- if (rbytes || wbytes || rios || wios)
- seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
- dname, rbytes, wbytes, rios, wios);
+ if (rbytes || wbytes || rios || wios) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
+
+ if (!blkcg_debug_stats)
+ goto next;
+
+ if (atomic_read(&blkg->use_delay)) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ (unsigned long long)atomic64_read(&blkg->delay_nsec));
+ }
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ size_t written;
+
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
+
+ written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
+ if (written)
+ has_stats = true;
+ off += written;
+ }
+next:
+ if (has_stats) {
+ off += scnprintf(buf+off, size-off, "\n");
+ seq_commit(sf, off);
+ }
}
rcu_read_unlock();
@@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
+ ret = blk_iolatency_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ return ret;
+ }
+
ret = blk_throtl_init(q);
if (ret) {
spin_lock_irq(q->queue_lock);
@@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&blkcg_pol_mutex);
}
+static void blkcg_exit(struct task_struct *tsk)
+{
+ if (tsk->throttle_queue)
+ blk_put_queue(tsk->throttle_queue);
+ tsk->throttle_queue = NULL;
+}
+
struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
@@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
.dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
.legacy_name = "blkio",
+ .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
/*
* This ensures that, if available, memcg is automatically enabled
@@ -1547,3 +1615,209 @@ out_unlock:
mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+ u64 old = atomic64_read(&blkg->delay_start);
+
+ /*
+ * We only want to scale down every second. The idea here is that we
+ * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+ * time window. We only want to throttle tasks for recent delay that
+ * has occurred, in 1 second time windows since that's the maximum
+ * things can be throttled. We save the current delay window in
+ * blkg->last_delay so we know what amount is still left to be charged
+ * to the blkg from this point onward. blkg->last_use keeps track of
+ * the use_delay counter. The idea is if we're unthrottling the blkg we
+ * are ok with whatever is happening now, and we can take away more of
+ * the accumulated delay as we've already throttled enough that
+ * everybody is happy with their IO latencies.
+ */
+ if (time_before64(old + NSEC_PER_SEC, now) &&
+ atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+ u64 cur = atomic64_read(&blkg->delay_nsec);
+ u64 sub = min_t(u64, blkg->last_delay, now - old);
+ int cur_use = atomic_read(&blkg->use_delay);
+
+ /*
+ * We've been unthrottled, subtract a larger chunk of our
+ * accumulated delay.
+ */
+ if (cur_use < blkg->last_use)
+ sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+ /*
+ * This shouldn't happen, but handle it anyway. Our delay_nsec
+ * should only ever be growing except here where we subtract out
+ * min(last_delay, 1 second), but lord knows bugs happen and I'd
+ * rather not end up with negative numbers.
+ */
+ if (unlikely(cur < sub)) {
+ atomic64_set(&blkg->delay_nsec, 0);
+ blkg->last_delay = 0;
+ } else {
+ atomic64_sub(sub, &blkg->delay_nsec);
+ blkg->last_delay = cur - sub;
+ }
+ blkg->last_use = cur_use;
+ }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ u64 exp;
+ u64 delay_nsec = 0;
+ int tok;
+
+ while (blkg->parent) {
+ if (atomic_read(&blkg->use_delay)) {
+ blkcg_scale_delay(blkg, now);
+ delay_nsec = max_t(u64, delay_nsec,
+ atomic64_read(&blkg->delay_nsec));
+ }
+ blkg = blkg->parent;
+ }
+
+ if (!delay_nsec)
+ return;
+
+ /*
+ * Let's not sleep for all eternity if we've amassed a huge delay.
+ * Swapping or metadata IO can accumulate 10's of seconds worth of
+ * delay, and we want userspace to be able to do _something_ so cap the
+ * delays at 1 second. If there's 10's of seconds worth of delay then
+ * the tasks will be delayed for 1 second for every syscall.
+ */
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+ /*
+ * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+ * that hasn't landed upstream yet. Once that stuff is in place we need
+ * to do a psi_memstall_enter/leave if memdelay is set.
+ */
+
+ exp = ktime_add_ns(now, delay_nsec);
+ tok = io_schedule_prepare();
+ do {
+ __set_current_state(TASK_KILLABLE);
+ if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+ break;
+ } while (!fatal_signal_pending(current));
+ io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+ struct request_queue *q = current->throttle_queue;
+ struct cgroup_subsys_state *css;
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ bool use_memdelay = current->use_memdelay;
+
+ if (!q)
+ return;
+
+ current->throttle_queue = NULL;
+ current->use_memdelay = false;
+
+ rcu_read_lock();
+ css = kthread_blkcg();
+ if (css)
+ blkcg = css_to_blkcg(css);
+ else
+ blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+ if (!blkcg)
+ goto out;
+ blkg = blkg_lookup(blkcg, q);
+ if (!blkg)
+ goto out;
+ blkg = blkg_try_get(blkg);
+ if (!blkg)
+ goto out;
+ rcu_read_unlock();
+
+ blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+ blkg_put(blkg);
+ blk_put_queue(q);
+ return;
+out:
+ rcu_read_unlock();
+ blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This set's the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+ if (unlikely(current->flags & PF_KTHREAD))
+ return;
+
+ if (!blk_get_queue(q))
+ return;
+
+ if (current->throttle_queue)
+ blk_put_queue(current->throttle_queue);
+ current->throttle_queue = q;
+ if (use_memdelay)
+ current->use_memdelay = use_memdelay;
+ set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+ blkcg_scale_delay(blkg, now);
+ atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
+module_param(blkcg_debug_stats, bool, 0644);
+MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index ee33590f54eb..12550340418d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -42,7 +42,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
-#include "blk-wbt.h"
+#include "blk-rq-qos.h"
#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
@@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
+/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
+void blk_exit_queue(struct request_queue *q)
+{
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ ioc_clear_queue(q);
+ elevator_exit(q, q->elevator);
+ q->elevator = NULL;
+ }
+
+ /*
+ * Remove all references to @q from the block cgroup controller before
+ * restoring @q->queue_lock to avoid that restoring this pointer causes
+ * e.g. blkcg_print_blkgs() to crash.
+ */
+ blkcg_exit_queue(q);
+
+ /*
+ * Since the cgroup code may dereference the @q->backing_dev_info
+ * pointer, only decrease its reference count after having removed the
+ * association with the block cgroup controller.
+ */
+ bdi_put(q->backing_dev_info);
+}
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
@@ -762,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q)
* make sure all in-progress dispatch are completed because
* blk_freeze_queue() can only complete all requests, and
* dispatch may still be in-progress since we dispatch requests
- * from more than one contexts
+ * from more than one contexts.
+ *
+ * No need to quiesce queue if it isn't initialized yet since
+ * blk_freeze_queue() should be enough for cases of passthrough
+ * request.
*/
- if (q->mq_ops)
+ if (q->mq_ops && blk_queue_init_done(q))
blk_mq_quiesce_queue(q);
/* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -780,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q)
*/
WARN_ON_ONCE(q->kobj.state_in_sysfs);
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q, q->elevator);
- q->elevator = NULL;
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-
- /*
- * Since the cgroup code may dereference the @q->backing_dev_info
- * pointer, only decrease its reference count after having removed the
- * association with the block cgroup controller.
- */
- bdi_put(q->backing_dev_info);
+ blk_exit_queue(q);
if (q->mq_ops)
blk_mq_free_queue(q);
@@ -1180,6 +1190,7 @@ out_exit_flush_rq:
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
+ q->fq = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1641,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (rq->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1748,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
- wbt_done(q->rq_wb, req);
+ rq_qos_done(q, req);
/*
* Request may not have originated from ll_rw_blk. if not,
@@ -1982,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
int where = ELEVATOR_INSERT_SORT;
struct request *req, *free;
unsigned int request_count = 0;
- unsigned int wb_acct;
/*
* low level driver can indicate that it wants pages above a
@@ -2040,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
}
get_rq:
- wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+ rq_qos_throttle(q, bio, q->queue_lock);
/*
* Grab a free request. This is might sleep but can not fail.
@@ -2050,7 +2060,7 @@ get_rq:
req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
if (IS_ERR(req)) {
blk_queue_exit(q);
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
else
@@ -2059,7 +2069,7 @@ get_rq:
goto out_unlock;
}
- wbt_track(req, wb_acct);
+ rq_qos_track(q, req, bio);
/*
* After dropping the lock and possibly sleeping here, our request
@@ -2700,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
- part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+ part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
@@ -2720,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now)
*/
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
unsigned long duration;
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
@@ -2728,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now)
cpu = part_stat_lock();
part = req->part;
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(req->q, cpu, part);
- part_dec_in_flight(req->q, part, rw);
+ part_dec_in_flight(req->q, part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
@@ -2751,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq)
return rq->rq_flags & RQF_PM;
case RPM_SUSPENDED:
return false;
+ default:
+ return true;
}
-
- return true;
}
#else
static bool blk_pm_allow_request(struct request *rq)
@@ -2980,7 +2990,7 @@ void blk_start_request(struct request *req)
req->throtl_size = blk_rq_sectors(req);
#endif
req->rq_flags |= RQF_STATS;
- wbt_issue(req->q->rq_wb, req);
+ rq_qos_issue(req->q, req);
}
BUG_ON(blk_rq_is_complete(req));
@@ -3053,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
+ * Note:
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+ * blk_rq_bytes() and in blk_update_request().
+ *
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
@@ -3200,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
blk_account_io_done(req, now);
if (req->end_io) {
- wbt_done(req->q->rq_wb, req);
+ rq_qos_done(q, req);
req->end_io(req, error);
} else {
if (blk_bidi_rq(req))
@@ -3763,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug);
*/
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
{
- /* not support for RQF_PM and ->rpm_status in blk-mq yet */
- if (q->mq_ops)
+ /* Don't enable runtime PM for blk-mq until it is ready */
+ if (q->mq_ops) {
+ pm_runtime_disable(dev);
return;
+ }
q->dev = dev;
q->rpm_status = RPM_ACTIVE;
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index f23311e4b201..01580f88fcb3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
- INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+ INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
new file mode 100644
index 000000000000..19923f8a029d
--- /dev/null
+++ b/block/blk-iolatency.c
@@ -0,0 +1,955 @@
+/*
+ * Block rq-qos base io controller
+ *
+ * This works similar to wbt with a few exceptions
+ *
+ * - It's bio based, so the latency covers the whole block layer in addition to
+ * the actual io.
+ * - We will throttle all IO that comes in here if we need to.
+ * - We use the mean latency over the 100ms window. This is because writes can
+ * be particularly fast, which could give us a false sense of the impact of
+ * other workloads on our protected workload.
+ * - By default there's no throttling, we set the queue_depth to UINT_MAX so
+ * that we can have as many outstanding bio's as we're allowed to. Only at
+ * throttle time do we pay attention to the actual queue depth.
+ *
+ * The hierarchy works like the cpu controller does, we track the latency at
+ * every configured node, and each configured node has it's own independent
+ * queue depth. This means that we only care about our latency targets at the
+ * peer level. Some group at the bottom of the hierarchy isn't going to affect
+ * a group at the end of some other path if we're only configred at leaf level.
+ *
+ * Consider the following
+ *
+ * root blkg
+ * / \
+ * fast (target=5ms) slow (target=10ms)
+ * / \ / \
+ * a b normal(15ms) unloved
+ *
+ * "a" and "b" have no target, but their combined io under "fast" cannot exceed
+ * an average latency of 5ms. If it does then we will throttle the "slow"
+ * group. In the case of "normal", if it exceeds its 15ms target, we will
+ * throttle "unloved", but nobody else.
+ *
+ * In this example "fast", "slow", and "normal" will be the only groups actually
+ * accounting their io latencies. We have to walk up the heirarchy to the root
+ * on every submit and complete so we can do the appropriate stat recording and
+ * adjust the queue depth of ourselves if needed.
+ *
+ * There are 2 ways we throttle IO.
+ *
+ * 1) Queue depth throttling. As we throttle down we will adjust the maximum
+ * number of IO's we're allowed to have in flight. This starts at (u64)-1 down
+ * to 1. If the group is only ever submitting IO for itself then this is the
+ * only way we throttle.
+ *
+ * 2) Induced delay throttling. This is for the case that a group is generating
+ * IO that has to be issued by the root cg to avoid priority inversion. So think
+ * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot
+ * of work done for us on behalf of the root cg and are being asked to scale
+ * down more then we induce a latency at userspace return. We accumulate the
+ * total amount of time we need to be punished by doing
+ *
+ * total_time += min_lat_nsec - actual_io_completion
+ *
+ * and then at throttle time will do
+ *
+ * throttle_time = min(total_time, NSEC_PER_SEC)
+ *
+ * This induced delay will throttle back the activity that is generating the
+ * root cg issued io's, wethere that's some metadata intensive operation or the
+ * group is using so much memory that it is pushing us into swap.
+ *
+ * Copyright (C) 2018 Josef Bacik
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/signal.h>
+#include <trace/events/block.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+
+#define DEFAULT_SCALE_COOKIE 1000000U
+
+static struct blkcg_policy blkcg_policy_iolatency;
+struct iolatency_grp;
+
+struct blk_iolatency {
+ struct rq_qos rqos;
+ struct timer_list timer;
+ atomic_t enabled;
+};
+
+static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct blk_iolatency, rqos);
+}
+
+static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
+{
+ return atomic_read(&blkiolat->enabled) > 0;
+}
+
+struct child_latency_info {
+ spinlock_t lock;
+
+ /* Last time we adjusted the scale of everybody. */
+ u64 last_scale_event;
+
+ /* The latency that we missed. */
+ u64 scale_lat;
+
+ /* Total io's from all of our children for the last summation. */
+ u64 nr_samples;
+
+ /* The guy who actually changed the latency numbers. */
+ struct iolatency_grp *scale_grp;
+
+ /* Cookie to tell if we need to scale up or down. */
+ atomic_t scale_cookie;
+};
+
+struct iolatency_grp {
+ struct blkg_policy_data pd;
+ struct blk_rq_stat __percpu *stats;
+ struct blk_iolatency *blkiolat;
+ struct rq_depth rq_depth;
+ struct rq_wait rq_wait;
+ atomic64_t window_start;
+ atomic_t scale_cookie;
+ u64 min_lat_nsec;
+ u64 cur_win_nsec;
+
+ /* total running average of our io latency. */
+ u64 lat_avg;
+
+ /* Our current number of IO's for the last summation. */
+ u64 nr_samples;
+
+ struct child_latency_info child_lat;
+};
+
+#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
+/*
+ * These are the constants used to fake the fixed-point moving average
+ * calculation just like load average. The call to CALC_LOAD folds
+ * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
+ * window size is bucketed to try to approximately calculate average
+ * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
+ * elapse immediately. Note, windows only elapse with IO activity. Idle
+ * periods extend the most recent window.
+ */
+#define BLKIOLATENCY_NR_EXP_FACTORS 5
+#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
+ (BLKIOLATENCY_NR_EXP_FACTORS - 1))
+static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
+ 2045, // exp(1/600) - 600 samples
+ 2039, // exp(1/240) - 240 samples
+ 2031, // exp(1/120) - 120 samples
+ 2023, // exp(1/80) - 80 samples
+ 2014, // exp(1/60) - 60 samples
+};
+
+static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
+}
+
+static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
+{
+ return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
+}
+
+static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
+{
+ return pd_to_blkg(&iolat->pd);
+}
+
+static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
+ wait_queue_entry_t *wait,
+ bool first_block)
+{
+ struct rq_wait *rqw = &iolat->rq_wait;
+
+ if (first_block && waitqueue_active(&rqw->wait) &&
+ rqw->wait.head.next != &wait->entry)
+ return false;
+ return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+}
+
+static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
+ struct iolatency_grp *iolat,
+ spinlock_t *lock, bool issue_as_root,
+ bool use_memdelay)
+ __releases(lock)
+ __acquires(lock)
+{
+ struct rq_wait *rqw = &iolat->rq_wait;
+ unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
+ DEFINE_WAIT(wait);
+ bool first_block = true;
+
+ if (use_delay)
+ blkcg_schedule_throttle(rqos->q, use_memdelay);
+
+ /*
+ * To avoid priority inversions we want to just take a slot if we are
+ * issuing as root. If we're being killed off there's no point in
+ * delaying things, we may have been killed by OOM so throttling may
+ * make recovery take even longer, so just let the IO's through so the
+ * task can go away.
+ */
+ if (issue_as_root || fatal_signal_pending(current)) {
+ atomic_inc(&rqw->inflight);
+ return;
+ }
+
+ if (iolatency_may_queue(iolat, &wait, first_block))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (iolatency_may_queue(iolat, &wait, first_block))
+ break;
+ first_block = false;
+
+ if (lock) {
+ spin_unlock_irq(lock);
+ io_schedule();
+ spin_lock_irq(lock);
+ } else {
+ io_schedule();
+ }
+ } while (1);
+
+ finish_wait(&rqw->wait, &wait);
+}
+
+#define SCALE_DOWN_FACTOR 2
+#define SCALE_UP_FACTOR 4
+
+static inline unsigned long scale_amount(unsigned long qd, bool up)
+{
+ return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
+}
+
+/*
+ * We scale the qd down faster than we scale up, so we need to use this helper
+ * to adjust the scale_cookie accordingly so we don't prematurely get
+ * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
+ *
+ * Each group has their own local copy of the last scale cookie they saw, so if
+ * the global scale cookie goes up or down they know which way they need to go
+ * based on their last knowledge of it.
+ */
+static void scale_cookie_change(struct blk_iolatency *blkiolat,
+ struct child_latency_info *lat_info,
+ bool up)
+{
+ unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+ unsigned long scale = scale_amount(qd, up);
+ unsigned long old = atomic_read(&lat_info->scale_cookie);
+ unsigned long max_scale = qd << 1;
+ unsigned long diff = 0;
+
+ if (old < DEFAULT_SCALE_COOKIE)
+ diff = DEFAULT_SCALE_COOKIE - old;
+
+ if (up) {
+ if (scale + old > DEFAULT_SCALE_COOKIE)
+ atomic_set(&lat_info->scale_cookie,
+ DEFAULT_SCALE_COOKIE);
+ else if (diff > qd)
+ atomic_inc(&lat_info->scale_cookie);
+ else
+ atomic_add(scale, &lat_info->scale_cookie);
+ } else {
+ /*
+ * We don't want to dig a hole so deep that it takes us hours to
+ * dig out of it. Just enough that we don't throttle/unthrottle
+ * with jagged workloads but can still unthrottle once pressure
+ * has sufficiently dissipated.
+ */
+ if (diff > qd) {
+ if (diff < max_scale)
+ atomic_dec(&lat_info->scale_cookie);
+ } else {
+ atomic_sub(scale, &lat_info->scale_cookie);
+ }
+ }
+}
+
+/*
+ * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
+ * queue depth at a time so we don't get wild swings and hopefully dial in to
+ * fairer distribution of the overall queue depth.
+ */
+static void scale_change(struct iolatency_grp *iolat, bool up)
+{
+ unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+ unsigned long scale = scale_amount(qd, up);
+ unsigned long old = iolat->rq_depth.max_depth;
+ bool changed = false;
+
+ if (old > qd)
+ old = qd;
+
+ if (up) {
+ if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
+ return;
+
+ if (old < qd) {
+ changed = true;
+ old += scale;
+ old = min(old, qd);
+ iolat->rq_depth.max_depth = old;
+ wake_up_all(&iolat->rq_wait.wait);
+ }
+ } else if (old > 1) {
+ old >>= 1;
+ changed = true;
+ iolat->rq_depth.max_depth = max(old, 1UL);
+ }
+}
+
+/* Check our parent and see if the scale cookie has changed. */
+static void check_scale_change(struct iolatency_grp *iolat)
+{
+ struct iolatency_grp *parent;
+ struct child_latency_info *lat_info;
+ unsigned int cur_cookie;
+ unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
+ u64 scale_lat;
+ unsigned int old;
+ int direction = 0;
+
+ if (lat_to_blkg(iolat)->parent == NULL)
+ return;
+
+ parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
+ if (!parent)
+ return;
+
+ lat_info = &parent->child_lat;
+ cur_cookie = atomic_read(&lat_info->scale_cookie);
+ scale_lat = READ_ONCE(lat_info->scale_lat);
+
+ if (cur_cookie < our_cookie)
+ direction = -1;
+ else if (cur_cookie > our_cookie)
+ direction = 1;
+ else
+ return;
+
+ old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
+
+ /* Somebody beat us to the punch, just bail. */
+ if (old != our_cookie)
+ return;
+
+ if (direction < 0 && iolat->min_lat_nsec) {
+ u64 samples_thresh;
+
+ if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
+ return;
+
+ /*
+ * Sometimes high priority groups are their own worst enemy, so
+ * instead of taking it out on some poor other group that did 5%
+ * or less of the IO's for the last summation just skip this
+ * scale down event.
+ */
+ samples_thresh = lat_info->nr_samples * 5;
+ samples_thresh = div64_u64(samples_thresh, 100);
+ if (iolat->nr_samples <= samples_thresh)
+ return;
+ }
+
+ /* We're as low as we can go. */
+ if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+ blkcg_use_delay(lat_to_blkg(iolat));
+ return;
+ }
+
+ /* We're back to the default cookie, unthrottle all the things. */
+ if (cur_cookie == DEFAULT_SCALE_COOKIE) {
+ blkcg_clear_delay(lat_to_blkg(iolat));
+ iolat->rq_depth.max_depth = UINT_MAX;
+ wake_up_all(&iolat->rq_wait.wait);
+ return;
+ }
+
+ scale_change(iolat, direction > 0);
+}
+
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
+ spinlock_t *lock)
+{
+ struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ struct request_queue *q = rqos->q;
+ bool issue_as_root = bio_issue_as_root_blkg(bio);
+
+ if (!blk_iolatency_enabled(blkiolat))
+ return;
+
+ rcu_read_lock();
+ blkcg = bio_blkcg(bio);
+ bio_associate_blkcg(bio, &blkcg->css);
+ blkg = blkg_lookup(blkcg, q);
+ if (unlikely(!blkg)) {
+ if (!lock)
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_create(blkcg, q);
+ if (IS_ERR(blkg))
+ blkg = NULL;
+ if (!lock)
+ spin_unlock_irq(q->queue_lock);
+ }
+ if (!blkg)
+ goto out;
+
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+ bio_associate_blkg(bio, blkg);
+out:
+ rcu_read_unlock();
+ while (blkg && blkg->parent) {
+ struct iolatency_grp *iolat = blkg_to_lat(blkg);
+ if (!iolat) {
+ blkg = blkg->parent;
+ continue;
+ }
+
+ check_scale_change(iolat);
+ __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+ blkg = blkg->parent;
+ }
+ if (!timer_pending(&blkiolat->timer))
+ mod_timer(&blkiolat->timer, jiffies + HZ);
+}
+
+static void iolatency_record_time(struct iolatency_grp *iolat,
+ struct bio_issue *issue, u64 now,
+ bool issue_as_root)
+{
+ struct blk_rq_stat *rq_stat;
+ u64 start = bio_issue_time(issue);
+ u64 req_time;
+
+ /*
+ * Have to do this so we are truncated to the correct time that our
+ * issue is truncated to.
+ */
+ now = __bio_issue_time(now);
+
+ if (now <= start)
+ return;
+
+ req_time = now - start;
+
+ /*
+ * We don't want to count issue_as_root bio's in the cgroups latency
+ * statistics as it could skew the numbers downwards.
+ */
+ if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+ u64 sub = iolat->min_lat_nsec;
+ if (req_time < sub)
+ blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
+ return;
+ }
+
+ rq_stat = get_cpu_ptr(iolat->stats);
+ blk_rq_stat_add(rq_stat, req_time);
+ put_cpu_ptr(rq_stat);
+}
+
+#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
+
+static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
+{
+ struct blkcg_gq *blkg = lat_to_blkg(iolat);
+ struct iolatency_grp *parent;
+ struct child_latency_info *lat_info;
+ struct blk_rq_stat stat;
+ unsigned long flags;
+ int cpu, exp_idx;
+
+ blk_rq_stat_init(&stat);
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *s;
+ s = per_cpu_ptr(iolat->stats, cpu);
+ blk_rq_stat_sum(&stat, s);
+ blk_rq_stat_init(s);
+ }
+ preempt_enable();
+
+ parent = blkg_to_lat(blkg->parent);
+ if (!parent)
+ return;
+
+ lat_info = &parent->child_lat;
+
+ /*
+ * CALC_LOAD takes in a number stored in fixed point representation.
+ * Because we are using this for IO time in ns, the values stored
+ * are significantly larger than the FIXED_1 denominator (2048).
+ * Therefore, rounding errors in the calculation are negligible and
+ * can be ignored.
+ */
+ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+ div64_u64(iolat->cur_win_nsec,
+ BLKIOLATENCY_EXP_BUCKET_SIZE));
+ CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+
+ /* Everything is ok and we don't need to adjust the scale. */
+ if (stat.mean <= iolat->min_lat_nsec &&
+ atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
+ return;
+
+ /* Somebody beat us to the punch, just bail. */
+ spin_lock_irqsave(&lat_info->lock, flags);
+ lat_info->nr_samples -= iolat->nr_samples;
+ lat_info->nr_samples += stat.nr_samples;
+ iolat->nr_samples = stat.nr_samples;
+
+ if ((lat_info->last_scale_event >= now ||
+ now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
+ lat_info->scale_lat <= iolat->min_lat_nsec)
+ goto out;
+
+ if (stat.mean <= iolat->min_lat_nsec &&
+ stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+ if (lat_info->scale_grp == iolat) {
+ lat_info->last_scale_event = now;
+ scale_cookie_change(iolat->blkiolat, lat_info, true);
+ }
+ } else if (stat.mean > iolat->min_lat_nsec) {
+ lat_info->last_scale_event = now;
+ if (!lat_info->scale_grp ||
+ lat_info->scale_lat > iolat->min_lat_nsec) {
+ WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
+ lat_info->scale_grp = iolat;
+ }
+ scale_cookie_change(iolat->blkiolat, lat_info, false);
+ }
+out:
+ spin_unlock_irqrestore(&lat_info->lock, flags);
+}
+
+static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+ struct blkcg_gq *blkg;
+ struct rq_wait *rqw;
+ struct iolatency_grp *iolat;
+ u64 window_start;
+ u64 now = ktime_to_ns(ktime_get());
+ bool issue_as_root = bio_issue_as_root_blkg(bio);
+ bool enabled = false;
+
+ blkg = bio->bi_blkg;
+ if (!blkg)
+ return;
+
+ iolat = blkg_to_lat(bio->bi_blkg);
+ if (!iolat)
+ return;
+
+ enabled = blk_iolatency_enabled(iolat->blkiolat);
+ while (blkg && blkg->parent) {
+ iolat = blkg_to_lat(blkg);
+ if (!iolat) {
+ blkg = blkg->parent;
+ continue;
+ }
+ rqw = &iolat->rq_wait;
+
+ atomic_dec(&rqw->inflight);
+ if (!enabled || iolat->min_lat_nsec == 0)
+ goto next;
+ iolatency_record_time(iolat, &bio->bi_issue, now,
+ issue_as_root);
+ window_start = atomic64_read(&iolat->window_start);
+ if (now > window_start &&
+ (now - window_start) >= iolat->cur_win_nsec) {
+ if (atomic64_cmpxchg(&iolat->window_start,
+ window_start, now) == window_start)
+ iolatency_check_latencies(iolat, now);
+ }
+next:
+ wake_up(&rqw->wait);
+ blkg = blkg->parent;
+ }
+}
+
+static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+ struct blkcg_gq *blkg;
+
+ blkg = bio->bi_blkg;
+ while (blkg && blkg->parent) {
+ struct rq_wait *rqw;
+ struct iolatency_grp *iolat;
+
+ iolat = blkg_to_lat(blkg);
+ if (!iolat)
+ goto next;
+
+ rqw = &iolat->rq_wait;
+ atomic_dec(&rqw->inflight);
+ wake_up(&rqw->wait);
+next:
+ blkg = blkg->parent;
+ }
+}
+
+static void blkcg_iolatency_exit(struct rq_qos *rqos)
+{
+ struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+
+ del_timer_sync(&blkiolat->timer);
+ blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
+ kfree(blkiolat);
+}
+
+static struct rq_qos_ops blkcg_iolatency_ops = {
+ .throttle = blkcg_iolatency_throttle,
+ .cleanup = blkcg_iolatency_cleanup,
+ .done_bio = blkcg_iolatency_done_bio,
+ .exit = blkcg_iolatency_exit,
+};
+
+static void blkiolatency_timer_fn(struct timer_list *t)
+{
+ struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
+ struct blkcg_gq *blkg;
+ struct cgroup_subsys_state *pos_css;
+ u64 now = ktime_to_ns(ktime_get());
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(blkg, pos_css,
+ blkiolat->rqos.q->root_blkg) {
+ struct iolatency_grp *iolat;
+ struct child_latency_info *lat_info;
+ unsigned long flags;
+ u64 cookie;
+
+ /*
+ * We could be exiting, don't access the pd unless we have a
+ * ref on the blkg.
+ */
+ if (!blkg_try_get(blkg))
+ continue;
+
+ iolat = blkg_to_lat(blkg);
+ if (!iolat)
+ goto next;
+
+ lat_info = &iolat->child_lat;
+ cookie = atomic_read(&lat_info->scale_cookie);
+
+ if (cookie >= DEFAULT_SCALE_COOKIE)
+ goto next;
+
+ spin_lock_irqsave(&lat_info->lock, flags);
+ if (lat_info->last_scale_event >= now)
+ goto next_lock;
+
+ /*
+ * We scaled down but don't have a scale_grp, scale up and carry
+ * on.
+ */
+ if (lat_info->scale_grp == NULL) {
+ scale_cookie_change(iolat->blkiolat, lat_info, true);
+ goto next_lock;
+ }
+
+ /*
+ * It's been 5 seconds since our last scale event, clear the
+ * scale grp in case the group that needed the scale down isn't
+ * doing any IO currently.
+ */
+ if (now - lat_info->last_scale_event >=
+ ((u64)NSEC_PER_SEC * 5))
+ lat_info->scale_grp = NULL;
+next_lock:
+ spin_unlock_irqrestore(&lat_info->lock, flags);
+next:
+ blkg_put(blkg);
+ }
+ rcu_read_unlock();
+}
+
+int blk_iolatency_init(struct request_queue *q)
+{
+ struct blk_iolatency *blkiolat;
+ struct rq_qos *rqos;
+ int ret;
+
+ blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
+ if (!blkiolat)
+ return -ENOMEM;
+
+ rqos = &blkiolat->rqos;
+ rqos->id = RQ_QOS_CGROUP;
+ rqos->ops = &blkcg_iolatency_ops;
+ rqos->q = q;
+
+ rq_qos_add(q, rqos);
+
+ ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
+ if (ret) {
+ rq_qos_del(q, rqos);
+ kfree(blkiolat);
+ return ret;
+ }
+
+ timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+
+ return 0;
+}
+
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+{
+ struct iolatency_grp *iolat = blkg_to_lat(blkg);
+ struct blk_iolatency *blkiolat = iolat->blkiolat;
+ u64 oldval = iolat->min_lat_nsec;
+
+ iolat->min_lat_nsec = val;
+ iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
+ iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
+ BLKIOLATENCY_MAX_WIN_SIZE);
+
+ if (!oldval && val)
+ atomic_inc(&blkiolat->enabled);
+ if (oldval && !val)
+ atomic_dec(&blkiolat->enabled);
+}
+
+static void iolatency_clear_scaling(struct blkcg_gq *blkg)
+{
+ if (blkg->parent) {
+ struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
+ struct child_latency_info *lat_info;
+ if (!iolat)
+ return;
+
+ lat_info = &iolat->child_lat;
+ spin_lock(&lat_info->lock);
+ atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
+ lat_info->last_scale_event = 0;
+ lat_info->scale_grp = NULL;
+ lat_info->scale_lat = 0;
+ spin_unlock(&lat_info->lock);
+ }
+}
+
+static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkcg_gq *blkg;
+ struct blk_iolatency *blkiolat;
+ struct blkg_conf_ctx ctx;
+ struct iolatency_grp *iolat;
+ char *p, *tok;
+ u64 lat_val = 0;
+ u64 oldval;
+ int ret;
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
+ if (ret)
+ return ret;
+
+ iolat = blkg_to_lat(ctx.blkg);
+ blkiolat = iolat->blkiolat;
+ p = ctx.body;
+
+ ret = -EINVAL;
+ while ((tok = strsep(&p, " "))) {
+ char key[16];
+ char val[21]; /* 18446744073709551616 */
+
+ if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
+ goto out;
+
+ if (!strcmp(key, "target")) {
+ u64 v;
+
+ if (!strcmp(val, "max"))
+ lat_val = 0;
+ else if (sscanf(val, "%llu", &v) == 1)
+ lat_val = v * NSEC_PER_USEC;
+ else
+ goto out;
+ } else {
+ goto out;
+ }
+ }
+
+ /* Walk up the tree to see if our new val is lower than it should be. */
+ blkg = ctx.blkg;
+ oldval = iolat->min_lat_nsec;
+
+ iolatency_set_min_lat_nsec(blkg, lat_val);
+ if (oldval != iolat->min_lat_nsec) {
+ iolatency_clear_scaling(blkg);
+ }
+
+ ret = 0;
+out:
+ blkg_conf_finish(&ctx);
+ return ret ?: nbytes;
+}
+
+static u64 iolatency_prfill_limit(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct iolatency_grp *iolat = pd_to_lat(pd);
+ const char *dname = blkg_dev_name(pd->blkg);
+
+ if (!dname || !iolat->min_lat_nsec)
+ return 0;
+ seq_printf(sf, "%s target=%llu\n",
+ dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
+ return 0;
+}
+
+static int iolatency_print_limit(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ iolatency_prfill_limit,
+ &blkcg_policy_iolatency, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
+ size_t size)
+{
+ struct iolatency_grp *iolat = pd_to_lat(pd);
+ unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+ unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+
+ if (iolat->rq_depth.max_depth == UINT_MAX)
+ return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
+ avg_lat, cur_win);
+
+ return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
+ iolat->rq_depth.max_depth, avg_lat, cur_win);
+}
+
+
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+{
+ struct iolatency_grp *iolat;
+
+ iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+ if (!iolat)
+ return NULL;
+ iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat), gfp);
+ if (!iolat->stats) {
+ kfree(iolat);
+ return NULL;
+ }
+ return &iolat->pd;
+}
+
+static void iolatency_pd_init(struct blkg_policy_data *pd)
+{
+ struct iolatency_grp *iolat = pd_to_lat(pd);
+ struct blkcg_gq *blkg = lat_to_blkg(iolat);
+ struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
+ struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct blk_rq_stat *stat;
+ stat = per_cpu_ptr(iolat->stats, cpu);
+ blk_rq_stat_init(stat);
+ }
+
+ rq_wait_init(&iolat->rq_wait);
+ spin_lock_init(&iolat->child_lat.lock);
+ iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+ iolat->rq_depth.max_depth = UINT_MAX;
+ iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+ iolat->blkiolat = blkiolat;
+ iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
+ atomic64_set(&iolat->window_start, now);
+
+ /*
+ * We init things in list order, so the pd for the parent may not be
+ * init'ed yet for whatever reason.
+ */
+ if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
+ struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
+ atomic_set(&iolat->scale_cookie,
+ atomic_read(&parent->child_lat.scale_cookie));
+ } else {
+ atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
+ }
+
+ atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
+}
+
+static void iolatency_pd_offline(struct blkg_policy_data *pd)
+{
+ struct iolatency_grp *iolat = pd_to_lat(pd);
+ struct blkcg_gq *blkg = lat_to_blkg(iolat);
+
+ iolatency_set_min_lat_nsec(blkg, 0);
+ iolatency_clear_scaling(blkg);
+}
+
+static void iolatency_pd_free(struct blkg_policy_data *pd)
+{
+ struct iolatency_grp *iolat = pd_to_lat(pd);
+ free_percpu(iolat->stats);
+ kfree(iolat);
+}
+
+static struct cftype iolatency_files[] = {
+ {
+ .name = "latency",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = iolatency_print_limit,
+ .write = iolatency_set_limit,
+ },
+ {}
+};
+
+static struct blkcg_policy blkcg_policy_iolatency = {
+ .dfl_cftypes = iolatency_files,
+ .pd_alloc_fn = iolatency_pd_alloc,
+ .pd_init_fn = iolatency_pd_init,
+ .pd_offline_fn = iolatency_pd_offline,
+ .pd_free_fn = iolatency_pd_free,
+ .pd_stat_fn = iolatency_pd_stat,
+};
+
+static int __init iolatency_init(void)
+{
+ return blkcg_policy_register(&blkcg_policy_iolatency);
+}
+
+static void __exit iolatency_exit(void)
+{
+ return blkcg_policy_unregister(&blkcg_policy_iolatency);
+}
+
+module_init(iolatency_init);
+module_exit(iolatency_exit);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8faa70f26fcd..d1b9dd03da25 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
*/
req_sects = min_t(sector_t, nr_sects,
q->limits.max_discard_sectors);
+ if (!req_sects)
+ goto fail;
if (req_sects > UINT_MAX >> 9)
req_sects = UINT_MAX >> 9;
@@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
*biop = bio;
return 0;
+
+fail:
+ if (bio) {
+ submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ *biop = NULL;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
new file mode 100644
index 000000000000..fb2c82c351e4
--- /dev/null
+++ b/block/blk-mq-debugfs-zoned.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include "blk-mq-debugfs.h"
+
+int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ unsigned int i;
+
+ if (!q->seq_zones_wlock)
+ return 0;
+
+ for (i = 0; i < q->nr_zones; i++)
+ if (test_bit(i, q->seq_zones_wlock))
+ seq_printf(m, "%u\n", i);
+
+ return 0;
+}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1c4532e92938..cb1e6cf7ac48 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
return count;
}
-static int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < blk_queue_nr_zones(q); i++)
- if (test_bit(i, q->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
-
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
@@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
return 0;
}
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+ struct blk_mq_hw_ctx *hctx = data;
+
+ seq_printf(m, "%u\n", hctx->dispatch_busy);
+ return 0;
+}
+
static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
__acquires(&ctx->lock)
{
@@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
+ {"dispatch_busy", 0400, hctx_dispatch_busy_show},
{},
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index b9d366e57097..a9160be12be0 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
}
#endif
+#ifdef CONFIG_BLK_DEBUG_FS_ZONED
+int queue_zone_wlock_show(void *data, struct seq_file *m);
+#else
+static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index e233996bb76f..db644ec624f5 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -17,6 +17,8 @@
#include <linux/pci.h>
#include <linux/module.h>
+#include "blk-mq.h"
+
/**
* blk_mq_pci_map_queues - provide a default queue mapping for PCI device
* @set: tagset to provide the mapping for
@@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
fallback:
WARN_ON_ONCE(set->nr_hw_queues > 1);
- for_each_possible_cpu(cpu)
- set->mq_map[cpu] = 0;
+ blk_mq_clear_mq_map(set);
return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 56c493c6cd90..cf9c66c6d35a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
-
- if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
- } else
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return false;
-
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
-
- if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
- } else
- clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ return;
+ clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
- return blk_mq_run_hw_queue(hctx, true);
+ blk_mq_run_hw_queue(hctx, true);
}
/*
@@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
} else if (has_sched_dispatch) {
blk_mq_do_dispatch_sched(hctx);
- } else if (q->mq_ops->get_budget) {
- /*
- * If we need to get budget before queuing request, we
- * dequeue request one by one from sw queue for avoiding
- * to mess up I/O merge when dispatch runs out of resource.
- *
- * TODO: get more budgets, and dequeue more requests in
- * one time.
- */
+ } else if (hctx->dispatch_busy) {
+ /* dequeue request one by one from sw queue if queue is busy */
blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
return e->type->ops.mq.bio_merge(hctx, bio);
}
- if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
+ if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+ !list_empty_careful(&ctx->rq_list)) {
/* default per sw-queue merge */
spin_lock(&ctx->lock);
ret = blk_mq_attempt_merge(q, ctx, bio);
@@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
return false;
}
-/**
- * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
- * @pos: loop cursor.
- * @skip: the list element that will not be examined. Iteration starts at
- * @skip->next.
- * @head: head of the list to examine. This list must have at least one
- * element, namely @skip.
- * @member: name of the list_head structure within typeof(*pos).
- */
-#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
- for ((pos) = (skip); \
- (pos = (pos)->member.next != (head) ? list_entry_rcu( \
- (pos)->member.next, typeof(*pos), member) : \
- list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
- (pos) != (skip); )
-
-/*
- * Called after a driver tag has been freed to check whether a hctx needs to
- * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
- * queues in a round-robin fashion if the tag set of @hctx is shared with other
- * hardware queues.
- */
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
-{
- struct blk_mq_tags *const tags = hctx->tags;
- struct blk_mq_tag_set *const set = hctx->queue->tag_set;
- struct request_queue *const queue = hctx->queue, *q;
- struct blk_mq_hw_ctx *hctx2;
- unsigned int i, j;
-
- if (set->flags & BLK_MQ_F_TAG_SHARED) {
- /*
- * If this is 0, then we know that no hardware queues
- * have RESTART marked. We're done.
- */
- if (!atomic_read(&queue->shared_hctx_restart))
- return;
-
- rcu_read_lock();
- list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
- tag_set_list) {
- queue_for_each_hw_ctx(q, hctx2, i)
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- goto done;
- }
- j = hctx->queue_num + 1;
- for (i = 0; i < queue->nr_hw_queues; i++, j++) {
- if (j == queue->nr_hw_queues)
- j = 0;
- hctx2 = queue->queue_hw_ctx[j];
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- break;
- }
-done:
- rcu_read_unlock();
- } else {
- blk_mq_sched_restart_hctx(hctx);
- }
-}
-
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async)
{
@@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false);
- else
+ else {
+ /*
+ * try to issue requests directly if the hw queue isn't
+ * busy in case of 'none' scheduler, and this way may save
+ * us one extra enqueue & dequeue to sw queue.
+ */
+ if (!hctx->dispatch_busy && !e && !run_queue_async) {
+ blk_mq_try_issue_list_directly(hctx, list);
+ if (list_empty(list))
+ return;
+ }
blk_mq_insert_requests(hctx, ctx, list);
+ }
blk_mq_run_hw_queue(hctx, run_queue_async);
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 3de0836163c2..816923bf874d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
/*
* If a previously inactive queue goes active, bump the active user count.
+ * We need to do this before try to allocate driver tag, then even if fail
+ * to get tag when first time, the other shared-tag users could reserve
+ * budget for it.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
@@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth <= tags->nr_reserved_tags)
return -EINVAL;
- tdepth -= tags->nr_reserved_tags;
-
/*
* If we are allowed to grow beyond the original size, allocate
* a new set of tags before freeing the old one.
@@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > 16 * BLKDEV_MAX_RQ)
return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+ new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+ tags->nr_reserved_tags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
@@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
- sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+ sbitmap_queue_resize(&tags->bitmap_tags,
+ tdepth - tags->nr_reserved_tags);
}
return 0;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 654b0dc7e001..72a0033ccee9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -34,8 +34,8 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
-#include "blk-wbt.h"
#include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->tag = -1;
rq->internal_tag = tag;
} else {
- if (blk_mq_tag_busy(data->hctx)) {
+ if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
@@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.mq.limit_depth(op, data);
+ } else {
+ blk_mq_tag_busy(data->hctx);
}
tag = blk_mq_get_tag(data);
@@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq)
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
- wbt_done(q->rq_wb, rq);
+ rq_qos_done(q, rq);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
@@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_account_io_done(rq, now);
if (rq->end_io) {
- wbt_done(rq->q->rq_wb, rq);
+ rq_qos_done(rq->q, rq);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
rq->throtl_size = blk_rq_sectors(rq);
#endif
rq->rq_flags |= RQF_STATS;
- wbt_issue(q->rq_wb, rq);
+ rq_qos_issue(q, rq);
}
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
- bool wait)
+bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_alloc_data data = {
.q = rq->q,
.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
- .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+ .flags = BLK_MQ_REQ_NOWAIT,
};
-
- might_sleep_if(wait);
+ bool shared;
if (rq->tag != -1)
goto done;
@@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
data.flags |= BLK_MQ_REQ_RESERVED;
+ shared = blk_mq_tag_busy(data.hctx);
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
- if (blk_mq_tag_busy(data.hctx)) {
+ if (shared) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&data.hctx->nr_active);
}
@@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
}
done:
- if (hctx)
- *hctx = data.hctx;
return rq->tag != -1;
}
@@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ spin_lock(&hctx->dispatch_wait_lock);
list_del_init(&wait->entry);
+ spin_unlock(&hctx->dispatch_wait_lock);
+
blk_mq_run_hw_queue(hctx, true);
return 1;
}
@@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
* restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
-static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_mq_hw_ctx *this_hctx = *hctx;
- struct sbq_wait_state *ws;
+ struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
- if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
- set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+ if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
* It's possible that a tag was freed in the window between the
@@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
- return blk_mq_get_driver_tag(rq, hctx, false);
+ return blk_mq_get_driver_tag(rq);
}
- wait = &this_hctx->dispatch_wait;
+ wait = &hctx->dispatch_wait;
if (!list_empty_careful(&wait->entry))
return false;
- spin_lock(&this_hctx->lock);
+ wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+
+ spin_lock_irq(&wq->lock);
+ spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
- ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
- add_wait_queue(&ws->wait, wait);
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue(wq, wait);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*/
- ret = blk_mq_get_driver_tag(rq, hctx, false);
+ ret = blk_mq_get_driver_tag(rq);
if (!ret) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
@@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
- spin_lock_irq(&ws->wait.lock);
list_del_init(&wait->entry);
- spin_unlock_irq(&ws->wait.lock);
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return true;
}
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
+/*
+ * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
+ * - EWMA is one simple way to compute running average value
+ * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
+ * - take 4 as factor for avoiding to get too small(0) result, and this
+ * factor doesn't matter because EWMA decreases exponentially
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+ unsigned int ewma;
+
+ if (hctx->queue->elevator)
+ return;
+
+ ewma = hctx->dispatch_busy;
+
+ if (!ewma && !busy)
+ return;
+
+ ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+ if (busy)
+ ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+ ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+ hctx->dispatch_busy = ewma;
+}
+
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
/*
@@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
break;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
@@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
- if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
blk_mq_put_dispatch_budget(hctx);
/*
* For non-shared tags, the RESTART check
@@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
- bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+ bd.last = !blk_mq_get_driver_tag(nxt);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
else if (needs_restart && (ret == BLK_STS_RESOURCE))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
+ blk_mq_update_dispatch_busy(hctx, true);
return false;
- }
+ } else
+ blk_mq_update_dispatch_busy(hctx, false);
/*
* If the host/device is unable to accept more work, inform the
@@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list)
{
+ struct request *rq;
+
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
- spin_lock(&ctx->lock);
- while (!list_empty(list)) {
- struct request *rq;
-
- rq = list_first_entry(list, struct request, queuelist);
+ list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
- list_del_init(&rq->queuelist);
- __blk_mq_insert_req_list(hctx, rq, false);
+ trace_block_rq_insert(hctx->queue, rq);
}
+
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(list, &ctx->rq_list);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
}
@@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
+ blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
default:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = BLK_QC_T_NONE;
break;
}
@@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(hctx);
goto insert;
}
@@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
return ret;
}
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list)
+{
+ while (!list_empty(list)) {
+ blk_status_t ret;
+ struct request *rq = list_first_entry(list, struct request,
+ queuelist);
+
+ list_del_init(&rq->queuelist);
+ ret = blk_mq_request_issue_directly(rq);
+ if (ret != BLK_STS_OK) {
+ if (ret == BLK_STS_RESOURCE ||
+ ret == BLK_STS_DEV_RESOURCE) {
+ list_add(&rq->queuelist, list);
+ break;
+ }
+ blk_mq_end_request(rq, ret);
+ }
+ }
+}
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
@@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
- unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (blk_mq_sched_bio_merge(q, bio))
return BLK_QC_T_NONE;
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+ rq_qos_throttle(q, bio, NULL);
trace_block_getrq(q, bio, bio->bi_opf);
rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
- wbt_track(rq, wb_acct);
+ rq_qos_track(q, rq, bio);
cookie = request_to_qc_t(data.hctx, rq);
@@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if (q->nr_hw_queues > 1 && is_sync) {
+ } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+ !data.hctx->dispatch_busy)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->nr_ctx = 0;
+ spin_lock_init(&hctx->dispatch_wait_lock);
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
@@ -2331,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared) {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
+ if (shared)
hctx->flags |= BLK_MQ_F_TAG_SHARED;
- } else {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
+ else
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
- }
}
}
@@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
blk_mq_update_tag_set_depth(set, false);
}
mutex_unlock(&set->tag_list_lock);
- synchronize_rcu();
INIT_LIST_HEAD(&q->tag_set_list);
}
@@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
if (set->ops->map_queues) {
- int cpu;
/*
* transport .map_queues is usually done in the following
* way:
@@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
- for_each_possible_cpu(cpu)
- set->mq_map[cpu] = 0;
+ blk_mq_clear_mq_map(set);
return set->ops->map_queues(set);
} else
@@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
- * requested depth down, if if it too large. In that case, the set
+ * requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 89231e439b2f..9497b47e2526 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
- bool wait);
+bool blk_mq_get_driver_tag(struct request *rq);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq);
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list);
/*
* CPU -> queue mappings
@@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(hctx, rq);
}
+static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set->mq_map[cpu] = 0;
+}
+
#endif
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
new file mode 100644
index 000000000000..0005dfd568dd
--- /dev/null
+++ b/block/blk-rq-qos.c
@@ -0,0 +1,194 @@
+#include "blk-rq-qos.h"
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, unsigned int below)
+{
+ unsigned int cur = atomic_read(v);
+
+ for (;;) {
+ unsigned int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
+{
+ return atomic_inc_below(&rq_wait->inflight, limit);
+}
+
+void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
+{
+ struct rq_qos *rqos;
+
+ for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->cleanup)
+ rqos->ops->cleanup(rqos, bio);
+ }
+}
+
+void rq_qos_done(struct request_queue *q, struct request *rq)
+{
+ struct rq_qos *rqos;
+
+ for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->done)
+ rqos->ops->done(rqos, rq);
+ }
+}
+
+void rq_qos_issue(struct request_queue *q, struct request *rq)
+{
+ struct rq_qos *rqos;
+
+ for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->issue)
+ rqos->ops->issue(rqos, rq);
+ }
+}
+
+void rq_qos_requeue(struct request_queue *q, struct request *rq)
+{
+ struct rq_qos *rqos;
+
+ for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->requeue)
+ rqos->ops->requeue(rqos, rq);
+ }
+}
+
+void rq_qos_throttle(struct request_queue *q, struct bio *bio,
+ spinlock_t *lock)
+{
+ struct rq_qos *rqos;
+
+ for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->throttle)
+ rqos->ops->throttle(rqos, bio, lock);
+ }
+}
+
+void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
+{
+ struct rq_qos *rqos;
+
+ for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->track)
+ rqos->ops->track(rqos, rq, bio);
+ }
+}
+
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+{
+ struct rq_qos *rqos;
+
+ for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->ops->done_bio)
+ rqos->ops->done_bio(rqos, bio);
+ }
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+bool rq_depth_calc_max_depth(struct rq_depth *rqd)
+{
+ unsigned int depth;
+ bool ret = false;
+
+ /*
+ * For QD=1 devices, this is a special case. It's important for those
+ * to have one request ready when one completes, so force a depth of
+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+ * since the device can't have more than that in flight. If we're
+ * scaling down, then keep a setting of 1/1/1.
+ */
+ if (rqd->queue_depth == 1) {
+ if (rqd->scale_step > 0)
+ rqd->max_depth = 1;
+ else {
+ rqd->max_depth = 2;
+ ret = true;
+ }
+ } else {
+ /*
+ * scale_step == 0 is our default state. If we have suffered
+ * latency spikes, step will be > 0, and we shrink the
+ * allowed write depths. If step is < 0, we're only doing
+ * writes, and we allow a temporarily higher depth to
+ * increase performance.
+ */
+ depth = min_t(unsigned int, rqd->default_depth,
+ rqd->queue_depth);
+ if (rqd->scale_step > 0)
+ depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
+ else if (rqd->scale_step < 0) {
+ unsigned int maxd = 3 * rqd->queue_depth / 4;
+
+ depth = 1 + ((depth - 1) << -rqd->scale_step);
+ if (depth > maxd) {
+ depth = maxd;
+ ret = true;
+ }
+ }
+
+ rqd->max_depth = depth;
+ }
+
+ return ret;
+}
+
+void rq_depth_scale_up(struct rq_depth *rqd)
+{
+ /*
+ * Hit max in previous round, stop here
+ */
+ if (rqd->scaled_max)
+ return;
+
+ rqd->scale_step--;
+
+ rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+{
+ /*
+ * Stop scaling down when we've hit the limit. This also prevents
+ * ->scale_step from going to crazy values, if the device can't
+ * keep up.
+ */
+ if (rqd->max_depth == 1)
+ return;
+
+ if (rqd->scale_step < 0 && hard_throttle)
+ rqd->scale_step = 0;
+ else
+ rqd->scale_step++;
+
+ rqd->scaled_max = false;
+ rq_depth_calc_max_depth(rqd);
+}
+
+void rq_qos_exit(struct request_queue *q)
+{
+ while (q->rq_qos) {
+ struct rq_qos *rqos = q->rq_qos;
+ q->rq_qos = rqos->next;
+ rqos->ops->exit(rqos);
+ }
+}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
new file mode 100644
index 000000000000..32b02efbfa66
--- /dev/null
+++ b/block/blk-rq-qos.h
@@ -0,0 +1,109 @@
+#ifndef RQ_QOS_H
+#define RQ_QOS_H
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
+enum rq_qos_id {
+ RQ_QOS_WBT,
+ RQ_QOS_CGROUP,
+};
+
+struct rq_wait {
+ wait_queue_head_t wait;
+ atomic_t inflight;
+};
+
+struct rq_qos {
+ struct rq_qos_ops *ops;
+ struct request_queue *q;
+ enum rq_qos_id id;
+ struct rq_qos *next;
+};
+
+struct rq_qos_ops {
+ void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *);
+ void (*track)(struct rq_qos *, struct request *, struct bio *);
+ void (*issue)(struct rq_qos *, struct request *);
+ void (*requeue)(struct rq_qos *, struct request *);
+ void (*done)(struct rq_qos *, struct request *);
+ void (*done_bio)(struct rq_qos *, struct bio *);
+ void (*cleanup)(struct rq_qos *, struct bio *);
+ void (*exit)(struct rq_qos *);
+};
+
+struct rq_depth {
+ unsigned int max_depth;
+
+ int scale_step;
+ bool scaled_max;
+
+ unsigned int queue_depth;
+ unsigned int default_depth;
+};
+
+static inline struct rq_qos *rq_qos_id(struct request_queue *q,
+ enum rq_qos_id id)
+{
+ struct rq_qos *rqos;
+ for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (rqos->id == id)
+ break;
+ }
+ return rqos;
+}
+
+static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_WBT);
+}
+
+static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_CGROUP);
+}
+
+static inline void rq_wait_init(struct rq_wait *rq_wait)
+{
+ atomic_set(&rq_wait->inflight, 0);
+ init_waitqueue_head(&rq_wait->wait);
+}
+
+static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+{
+ rqos->next = q->rq_qos;
+ q->rq_qos = rqos;
+}
+
+static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+{
+ struct rq_qos *cur, *prev = NULL;
+ for (cur = q->rq_qos; cur; cur = cur->next) {
+ if (cur == rqos) {
+ if (prev)
+ prev->next = rqos->next;
+ else
+ q->rq_qos = cur;
+ break;
+ }
+ prev = cur;
+ }
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
+void rq_depth_scale_up(struct rq_depth *rqd);
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_calc_max_depth(struct rq_depth *rqd);
+
+void rq_qos_cleanup(struct request_queue *, struct bio *);
+void rq_qos_done(struct request_queue *, struct request *);
+void rq_qos_issue(struct request_queue *, struct request *);
+void rq_qos_requeue(struct request_queue *, struct request *);
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
+void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
+void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
+void rq_qos_exit(struct request_queue *);
+#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d1de71124656..ffd459969689 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
- lim->max_discard_segments = 1;
+ lim->max_discard_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
@@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
- wbt_set_queue_depth(q->rq_wb, depth);
+ wbt_set_queue_depth(q, depth);
}
EXPORT_SYMBOL(blk_set_queue_depth);
@@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
queue_flag_clear(QUEUE_FLAG_FUA, q);
spin_unlock_irq(q->queue_lock);
- wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 175c143ac5b9..7587b1c3caaf 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -17,7 +17,7 @@ struct blk_queue_stats {
bool enable_accounting;
};
-static void blk_stat_init(struct blk_rq_stat *stat)
+void blk_rq_stat_init(struct blk_rq_stat *stat)
{
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
@@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat)
}
/* src is a per-cpu stat, mean isn't initialized */
-static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
if (!src->nr_samples)
return;
@@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
dst->nr_samples += src->nr_samples;
}
-static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
+void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
{
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
@@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now)
continue;
stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
- __blk_stat_add(stat, value);
+ blk_rq_stat_add(stat, value);
put_cpu_ptr(cb->cpu_stat);
}
rcu_read_unlock();
@@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t)
int cpu;
for (bucket = 0; bucket < cb->buckets; bucket++)
- blk_stat_init(&cb->stat[bucket]);
+ blk_rq_stat_init(&cb->stat[bucket]);
for_each_online_cpu(cpu) {
struct blk_rq_stat *cpu_stat;
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
for (bucket = 0; bucket < cb->buckets; bucket++) {
- blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
- blk_stat_init(&cpu_stat[bucket]);
+ blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+ blk_rq_stat_init(&cpu_stat[bucket]);
}
}
@@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q,
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
for (bucket = 0; bucket < cb->buckets; bucket++)
- blk_stat_init(&cpu_stat[bucket]);
+ blk_rq_stat_init(&cpu_stat[bucket]);
}
spin_lock(&q->stats->lock);
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 78399cdde9c9..f4a1568e81a4 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
}
+void blk_rq_stat_add(struct blk_rq_stat *, u64);
+void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+void blk_rq_stat_init(struct blk_rq_stat *);
+
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 94987b1f69e1..bb109bb0a055 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
{
- if (!q->rq_wb)
+ if (!wbt_rq_qos(q))
return -EINVAL;
- return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+ return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
size_t count)
{
- struct rq_wb *rwb;
+ struct rq_qos *rqos;
ssize_t ret;
s64 val;
@@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
if (val < -1)
return -EINVAL;
- rwb = q->rq_wb;
- if (!rwb) {
+ rqos = wbt_rq_qos(q);
+ if (!rqos) {
ret = wbt_init(q);
if (ret)
return ret;
}
- rwb = q->rq_wb;
if (val == -1)
- rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ val = wbt_default_latency_nsec(q);
else if (val >= 0)
- rwb->min_lat_nsec = val * 1000ULL;
+ val *= 1000ULL;
- if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
- rwb->enable_state = WBT_STATE_ON_MANUAL;
+ wbt_set_min_lat(q, val);
- wbt_update_limits(rwb);
+ wbt_update_limits(q);
return count;
}
@@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
+ if (!blk_queue_dead(q)) {
+ /*
+ * Last reference was dropped without having called
+ * blk_cleanup_queue().
+ */
+ WARN_ONCE(blk_queue_init_done(q),
+ "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n",
+ q);
+ blk_exit_queue(q);
+ }
+
+ WARN(blk_queue_root_blkg(q),
+ "request queue %p is being released but it has not yet been removed from the blkcg controller\n",
+ q);
+
blk_free_queue_stats(q->stats);
blk_exit_rl(q, &q->root_rl);
@@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
- wbt_exit(q);
+ rq_qos_exit(q);
mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 82282e6fdcf8..a3eede00d302 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
- tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
low_valid = true;
+ break;
+ }
}
rcu_read_unlock();
@@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
/* Calc approx time to dispatch */
- jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
-
- if (jiffy_wait > jiffy_elapsed)
- jiffy_wait = jiffy_wait - jiffy_elapsed;
- else
- jiffy_wait = 1;
+ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
if (wait)
*wait = jiffy_wait;
@@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (bio->bi_css) {
- if (bio->bi_cg_private)
- blkg_put(tg_to_blkg(bio->bi_cg_private));
- bio->bi_cg_private = tg;
- blkg_get(tg_to_blkg(tg));
- }
+ if (bio->bi_css)
+ bio_associate_blkg(bio, tg_to_blkg(tg));
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
#endif
}
@@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
void blk_throtl_bio_endio(struct bio *bio)
{
+ struct blkcg_gq *blkg;
struct throtl_grp *tg;
u64 finish_time_ns;
unsigned long finish_time;
@@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long lat;
int rw = bio_data_dir(bio);
- tg = bio->bi_cg_private;
- if (!tg)
+ blkg = bio->bi_blkg;
+ if (!blkg)
return;
- bio->bi_cg_private = NULL;
+ tg = blkg_to_tg(blkg);
finish_time_ns = ktime_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10;
finish_time = __bio_issue_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time) {
- blkg_put(tg_to_blkg(tg));
+ if (!start_time || finish_time <= start_time)
return;
- }
lat = finish_time - start_time;
/* this is only for bio based driver */
@@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio)
tg->bio_cnt /= 2;
tg->bad_bio_cnt /= 2;
}
-
- blkg_put(tg_to_blkg(tg));
}
#endif
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4f89b28fa652..1d94a20374fc 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include "blk-wbt.h"
+#include "blk-rq-qos.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
return rwb && rwb->wb_normal != 0;
}
-/*
- * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- * false if 'v' + 1 would be bigger than 'below'.
- */
-static bool atomic_inc_below(atomic_t *v, int below)
-{
- int cur = atomic_read(v);
-
- for (;;) {
- int old;
-
- if (cur >= below)
- return false;
- old = atomic_cmpxchg(v, cur, cur + 1);
- if (old == cur)
- break;
- cur = old;
- }
-
- return true;
-}
-
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
if (rwb_enabled(rwb)) {
@@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
+ struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
return time_before(jiffies, wb->dirty_sleep + HZ);
}
@@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
}
}
-void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
+ struct rq_wb *rwb = RQWB(rqos);
struct rq_wait *rqw;
int inflight, limit;
@@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
int diff = limit - inflight;
if (!inflight || diff >= rwb->wb_background / 2)
- wake_up_all(&rqw->wait);
+ wake_up(&rqw->wait);
}
}
@@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
* Called on completion of a request. Note that it's also called when
* a request is merged, when the request gets freed.
*/
-void wbt_done(struct rq_wb *rwb, struct request *rq)
+static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
- if (!rwb)
- return;
+ struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
if (rwb->sync_cookie == rq) {
@@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
wb_timestamp(rwb, &rwb->last_comp);
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
- __wbt_done(rwb, wbt_flags(rq));
+ __wbt_done(rqos, wbt_flags(rq));
}
wbt_clear_state(rq);
}
-/*
- * Return true, if we can't increase the depth further by scaling
- */
-static bool calc_wb_limits(struct rq_wb *rwb)
-{
- unsigned int depth;
- bool ret = false;
-
- if (!rwb->min_lat_nsec) {
- rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- return false;
- }
-
- /*
- * For QD=1 devices, this is a special case. It's important for those
- * to have one request ready when one completes, so force a depth of
- * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
- * since the device can't have more than that in flight. If we're
- * scaling down, then keep a setting of 1/1/1.
- */
- if (rwb->queue_depth == 1) {
- if (rwb->scale_step > 0)
- rwb->wb_max = rwb->wb_normal = 1;
- else {
- rwb->wb_max = rwb->wb_normal = 2;
- ret = true;
- }
- rwb->wb_background = 1;
- } else {
- /*
- * scale_step == 0 is our default state. If we have suffered
- * latency spikes, step will be > 0, and we shrink the
- * allowed write depths. If step is < 0, we're only doing
- * writes, and we allow a temporarily higher depth to
- * increase performance.
- */
- depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
- if (rwb->scale_step > 0)
- depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
- else if (rwb->scale_step < 0) {
- unsigned int maxd = 3 * rwb->queue_depth / 4;
-
- depth = 1 + ((depth - 1) << -rwb->scale_step);
- if (depth > maxd) {
- depth = maxd;
- ret = true;
- }
- }
-
- /*
- * Set our max/normal/bg queue depths based on how far
- * we have scaled down (->scale_step).
- */
- rwb->wb_max = depth;
- rwb->wb_normal = (rwb->wb_max + 1) / 2;
- rwb->wb_background = (rwb->wb_max + 3) / 4;
- }
-
- return ret;
-}
-
static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
/*
@@ -307,7 +225,8 @@ enum {
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
- struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct rq_depth *rqd = &rwb->rq_depth;
u64 thislat;
/*
@@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_EXCEEDED;
}
- if (rwb->scale_step)
+ if (rqd->scale_step)
trace_wbt_stat(bdi, stat);
return LAT_OK;
@@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
- struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct rq_depth *rqd = &rwb->rq_depth;
- trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
- rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+ trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
+ rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}
-static void scale_up(struct rq_wb *rwb)
+static void calc_wb_limits(struct rq_wb *rwb)
{
- /*
- * Hit max in previous round, stop here
- */
- if (rwb->scaled_max)
- return;
+ if (rwb->min_lat_nsec == 0) {
+ rwb->wb_normal = rwb->wb_background = 0;
+ } else if (rwb->rq_depth.max_depth <= 2) {
+ rwb->wb_normal = rwb->rq_depth.max_depth;
+ rwb->wb_background = 1;
+ } else {
+ rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
+ rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
+ }
+}
- rwb->scale_step--;
+static void scale_up(struct rq_wb *rwb)
+{
+ rq_depth_scale_up(&rwb->rq_depth);
+ calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
-
- rwb->scaled_max = calc_wb_limits(rwb);
-
- rwb_wake_all(rwb);
-
- rwb_trace_step(rwb, "step up");
+ rwb_trace_step(rwb, "scale up");
}
-/*
- * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
- */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
- /*
- * Stop scaling down when we've hit the limit. This also prevents
- * ->scale_step from going to crazy values, if the device can't
- * keep up.
- */
- if (rwb->wb_max == 1)
- return;
-
- if (rwb->scale_step < 0 && hard_throttle)
- rwb->scale_step = 0;
- else
- rwb->scale_step++;
-
- rwb->scaled_max = false;
- rwb->unknown_cnt = 0;
+ rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
calc_wb_limits(rwb);
- rwb_trace_step(rwb, "step down");
+ rwb->unknown_cnt = 0;
+ rwb_wake_all(rwb);
+ rwb_trace_step(rwb, "scale down");
}
static void rwb_arm_timer(struct rq_wb *rwb)
{
- if (rwb->scale_step > 0) {
+ struct rq_depth *rqd = &rwb->rq_depth;
+
+ if (rqd->scale_step > 0) {
/*
* We should speed this up, using some variant of a fast
* integer inverse square root calculation. Since we only do
@@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
* though.
*/
rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
- int_sqrt((rwb->scale_step + 1) << 8));
+ int_sqrt((rqd->scale_step + 1) << 8));
} else {
/*
* For step < 0, we don't want to increase/decrease the
@@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
static void wb_timer_fn(struct blk_stat_callback *cb)
{
struct rq_wb *rwb = cb->data;
+ struct rq_depth *rqd = &rwb->rq_depth;
unsigned int inflight = wbt_inflight(rwb);
int status;
status = latency_exceeded(rwb, cb->stat);
- trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
+ trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
inflight);
/*
@@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
* currently don't have a valid read/write sample. For that
* case, slowly return to center state (step == 0).
*/
- if (rwb->scale_step > 0)
+ if (rqd->scale_step > 0)
scale_up(rwb);
- else if (rwb->scale_step < 0)
+ else if (rqd->scale_step < 0)
scale_down(rwb, false);
break;
default:
@@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
/*
* Re-arm timer, if we have IO in flight
*/
- if (rwb->scale_step || inflight)
+ if (rqd->scale_step || inflight)
rwb_arm_timer(rwb);
}
-void wbt_update_limits(struct rq_wb *rwb)
+static void __wbt_update_limits(struct rq_wb *rwb)
{
- rwb->scale_step = 0;
- rwb->scaled_max = false;
+ struct rq_depth *rqd = &rwb->rq_depth;
+
+ rqd->scale_step = 0;
+ rqd->scaled_max = false;
+
+ rq_depth_calc_max_depth(rqd);
calc_wb_limits(rwb);
rwb_wake_all(rwb);
}
+void wbt_update_limits(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (!rqos)
+ return;
+ __wbt_update_limits(RQWB(rqos));
+}
+
+u64 wbt_get_min_lat(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (!rqos)
+ return 0;
+ return RQWB(rqos)->min_lat_nsec;
+}
+
+void wbt_set_min_lat(struct request_queue *q, u64 val)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (!rqos)
+ return;
+ RQWB(rqos)->min_lat_nsec = val;
+ RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ __wbt_update_limits(RQWB(rqos));
+}
+
+
static bool close_io(struct rq_wb *rwb)
{
const unsigned long now = jiffies;
@@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
* IO for a bit.
*/
if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
- limit = rwb->wb_max;
+ limit = rwb->rq_depth.max_depth;
else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
/*
* If less than 100ms since we completed unrelated IO,
@@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
return limit;
}
-static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
- wait_queue_entry_t *wait, unsigned long rw)
-{
- /*
- * inc it here even if disabled, since we'll dec it at completion.
- * this only happens if the task was sleeping in __wbt_wait(),
- * and someone turned it off at the same time.
- */
- if (!rwb_enabled(rwb)) {
- atomic_inc(&rqw->inflight);
- return true;
- }
-
- /*
- * If the waitqueue is already active and we are not the next
- * in line to be woken up, wait for our turn.
- */
- if (waitqueue_active(&rqw->wait) &&
- rqw->wait.head.next != &wait->entry)
- return false;
-
- return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
-}
-
/*
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
@@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
__acquires(lock)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
- DEFINE_WAIT(wait);
+ DECLARE_WAITQUEUE(wait, current);
- if (may_queue(rwb, rqw, &wait, rw))
+ /*
+ * inc it here even if disabled, since we'll dec it at completion.
+ * this only happens if the task was sleeping in __wbt_wait(),
+ * and someone turned it off at the same time.
+ */
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
return;
+ }
+ if (!waitqueue_active(&rqw->wait)
+ && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+ return;
+
+ add_wait_queue_exclusive(&rqw->wait, &wait);
do {
- prepare_to_wait_exclusive(&rqw->wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
- if (may_queue(rwb, rqw, &wait, rw))
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
+ break;
+ }
+
+ if (rq_wait_inc_below(rqw, get_limit(rwb, rw)))
break;
if (lock) {
@@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
io_schedule();
} while (1);
- finish_wait(&rqw->wait, &wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&rqw->wait, &wait);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
}
}
+static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
+{
+ enum wbt_flags flags = 0;
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ flags = WBT_READ;
+ } else if (wbt_should_throttle(rwb, bio)) {
+ if (current_is_kswapd())
+ flags |= WBT_KSWAPD;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ flags |= WBT_DISCARD;
+ flags |= WBT_TRACKED;
+ }
+ return flags;
+}
+
+static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
+ __wbt_done(rqos, flags);
+}
+
/*
* Returns true if the IO request should be accounted, false if not.
* May sleep, if we have exceeded the writeback limits. Caller can pass
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
*/
-enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
{
- enum wbt_flags ret = 0;
+ struct rq_wb *rwb = RQWB(rqos);
+ enum wbt_flags flags;
if (!rwb_enabled(rwb))
- return 0;
+ return;
- if (bio_op(bio) == REQ_OP_READ)
- ret = WBT_READ;
+ flags = bio_to_wbt_flags(rwb, bio);
if (!wbt_should_throttle(rwb, bio)) {
- if (ret & WBT_READ)
+ if (flags & WBT_READ)
wb_timestamp(rwb, &rwb->last_issue);
- return ret;
+ return;
}
if (current_is_kswapd())
- ret |= WBT_KSWAPD;
+ flags |= WBT_KSWAPD;
if (bio_op(bio) == REQ_OP_DISCARD)
- ret |= WBT_DISCARD;
+ flags |= WBT_DISCARD;
- __wbt_wait(rwb, ret, bio->bi_opf, lock);
+ __wbt_wait(rwb, flags, bio->bi_opf, lock);
if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
+}
- return ret | WBT_TRACKED;
+static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}
-void wbt_issue(struct rq_wb *rwb, struct request *rq)
+void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
+ struct rq_wb *rwb = RQWB(rqos);
+
if (!rwb_enabled(rwb))
return;
@@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
}
}
-void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
+ struct rq_wb *rwb = RQWB(rqos);
if (!rwb_enabled(rwb))
return;
if (rq == rwb->sync_cookie) {
@@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
}
}
-void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
- if (rwb) {
- rwb->queue_depth = depth;
- wbt_update_limits(rwb);
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (rqos) {
+ RQWB(rqos)->rq_depth.queue_depth = depth;
+ __wbt_update_limits(RQWB(rqos));
}
}
-void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
- if (rwb)
- rwb->wc = write_cache_on;
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (rqos)
+ RQWB(rqos)->wc = write_cache_on;
}
/*
- * Disable wbt, if enabled by default.
- */
-void wbt_disable_default(struct request_queue *q)
-{
- struct rq_wb *rwb = q->rq_wb;
-
- if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
- wbt_exit(q);
-}
-EXPORT_SYMBOL_GPL(wbt_disable_default);
-
-/*
* Enable wbt if defaults are configured that way
*/
void wbt_enable_default(struct request_queue *q)
{
+ struct rq_qos *rqos = wbt_rq_qos(q);
/* Throttling already enabled? */
- if (q->rq_wb)
+ if (rqos)
return;
/* Queue not registered? Maybe shutting down... */
@@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq)
return -1;
}
+static void wbt_exit(struct rq_qos *rqos)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ struct request_queue *q = rqos->q;
+
+ blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_free_callback(rwb->cb);
+ kfree(rwb);
+}
+
+/*
+ * Disable wbt, if enabled by default.
+ */
+void wbt_disable_default(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_wb *rwb;
+ if (!rqos)
+ return;
+ rwb = RQWB(rqos);
+ if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+ rwb->wb_normal = 0;
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+
+static struct rq_qos_ops wbt_rqos_ops = {
+ .throttle = wbt_wait,
+ .issue = wbt_issue,
+ .track = wbt_track,
+ .requeue = wbt_requeue,
+ .done = wbt_done,
+ .cleanup = wbt_cleanup,
+ .exit = wbt_exit,
+};
+
int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
@@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q)
return -ENOMEM;
}
- for (i = 0; i < WBT_NUM_RWQ; i++) {
- atomic_set(&rwb->rq_wait[i].inflight, 0);
- init_waitqueue_head(&rwb->rq_wait[i].wait);
- }
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ rq_wait_init(&rwb->rq_wait[i]);
+ rwb->rqos.id = RQ_QOS_WBT;
+ rwb->rqos.ops = &wbt_rqos_ops;
+ rwb->rqos.q = q;
rwb->last_comp = rwb->last_issue = jiffies;
- rwb->queue = q;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- wbt_update_limits(rwb);
+ rwb->wc = 1;
+ rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+ __wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
*/
- q->rq_wb = rwb;
+ rq_qos_add(q, &rwb->rqos);
blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
- wbt_set_queue_depth(rwb, blk_queue_depth(q));
- wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ wbt_set_queue_depth(q, blk_queue_depth(q));
+ wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
}
-
-void wbt_exit(struct request_queue *q)
-{
- struct rq_wb *rwb = q->rq_wb;
-
- if (rwb) {
- blk_stat_remove_callback(q, rwb->cb);
- blk_stat_free_callback(rwb->cb);
- q->rq_wb = NULL;
- kfree(rwb);
- }
-}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 300df531d0a6..f47218d5b3b2 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -9,6 +9,7 @@
#include <linux/ktime.h>
#include "blk-stat.h"
+#include "blk-rq-qos.h"
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
@@ -35,20 +36,12 @@ enum {
WBT_STATE_ON_MANUAL = 2,
};
-struct rq_wait {
- wait_queue_head_t wait;
- atomic_t inflight;
-};
-
struct rq_wb {
/*
* Settings that govern how we throttle
*/
unsigned int wb_background; /* background writeback */
unsigned int wb_normal; /* normal writeback */
- unsigned int wb_max; /* max throughput writeback */
- int scale_step;
- bool scaled_max;
short enable_state; /* WBT_STATE_* */
@@ -67,15 +60,20 @@ struct rq_wb {
void *sync_cookie;
unsigned int wc;
- unsigned int queue_depth;
unsigned long last_issue; /* last non-throttled issue */
unsigned long last_comp; /* last non-throttled comp */
unsigned long min_lat_nsec;
- struct request_queue *queue;
+ struct rq_qos rqos;
struct rq_wait rq_wait[WBT_NUM_RWQ];
+ struct rq_depth rq_depth;
};
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_wb, rqos);
+}
+
static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
unsigned int i, ret = 0;
@@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
return ret;
}
-#ifdef CONFIG_BLK_WBT
-static inline void wbt_track(struct request *rq, enum wbt_flags flags)
-{
- rq->wbt_flags |= flags;
-}
+#ifdef CONFIG_BLK_WBT
-void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct request *);
-enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
int wbt_init(struct request_queue *);
-void wbt_exit(struct request_queue *);
-void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct request *);
-void wbt_issue(struct rq_wb *, struct request *);
+void wbt_update_limits(struct request_queue *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
-void wbt_set_queue_depth(struct rq_wb *, unsigned int);
-void wbt_set_write_cache(struct rq_wb *, bool);
+u64 wbt_get_min_lat(struct request_queue *q);
+void wbt_set_min_lat(struct request_queue *q, u64 val);
+
+void wbt_set_queue_depth(struct request_queue *, unsigned int);
+void wbt_set_write_cache(struct request_queue *, bool);
u64 wbt_default_latency_nsec(struct request_queue *);
@@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *);
static inline void wbt_track(struct request *rq, enum wbt_flags flags)
{
}
-static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
-{
-}
-static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
-{
-}
-static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
- spinlock_t *lock)
-{
- return 0;
-}
static inline int wbt_init(struct request_queue *q)
{
return -EINVAL;
}
-static inline void wbt_exit(struct request_queue *q)
-{
-}
-static inline void wbt_update_limits(struct rq_wb *rwb)
+static inline void wbt_update_limits(struct request_queue *q)
{
}
-static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_disable_default(struct request_queue *q)
{
}
-static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_enable_default(struct request_queue *q)
{
}
-static inline void wbt_disable_default(struct request_queue *q)
+static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
}
-static inline void wbt_enable_default(struct request_queue *q)
+static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
{
}
-static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+static inline u64 wbt_get_min_lat(struct request_queue *q)
{
+ return 0;
}
-static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
{
}
static inline u64 wbt_default_latency_nsec(struct request_queue *q)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 51000914e23f..c461cf63f1f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev,
/* Get header in the first page */
ofst = 0;
if (!nr_rep) {
- hdr = (struct blk_zone_report_hdr *) addr;
+ hdr = addr;
nr_rep = hdr->nr_zones;
ofst = sizeof(struct blk_zone_report_hdr);
}
diff --git a/block/blk.h b/block/blk.h
index 8d23aea96ce9..d4d67e948920 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_queue *q, struct request_list *rl);
+void blk_exit_queue(struct request_queue *q);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
@@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
extern void blk_drain_queue(struct request_queue *q);
+#ifdef CONFIG_BLK_CGROUP_IOLATENCY
+extern int blk_iolatency_init(struct request_queue *q);
+#else
+static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
+#endif
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index fd31347b7836..bc63b3a2d18c 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
__bounce_end_io_read(bio, &isa_page_pool);
}
+static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct bio *bio;
+
+ /*
+ * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
+ * bio_src->bi_io_vec to bio->bi_io_vec.
+ *
+ * We can't do that anymore, because:
+ *
+ * - The point of cloning the biovec is to produce a bio with a biovec
+ * the caller can modify: bi_idx and bi_bvec_done should be 0.
+ *
+ * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+ * we tried to clone the whole thing bio_alloc_bioset() would fail.
+ * But the clone should succeed as long as the number of biovecs we
+ * actually need to allocate is fewer than BIO_MAX_PAGES.
+ *
+ * - Lastly, bi_vcnt should not be looked at or relied upon by code
+ * that does not own the bio - reason being drivers don't use it for
+ * iterating over the biovec anymore, so expecting it to be kept up
+ * to date (i.e. for clones that share the parent biovec) is just
+ * asking for trouble and would force extra work on
+ * __bio_clone_fast() anyways.
+ */
+
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+ if (!bio)
+ return NULL;
+ bio->bi_disk = bio_src->bi_disk;
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
+ bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ break;
+ case REQ_OP_WRITE_SAME:
+ bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+ break;
+ default:
+ bio_for_each_segment(bv, bio_src, iter)
+ bio->bi_io_vec[bio->bi_vcnt++] = bv;
+ break;
+ }
+
+ if (bio_integrity(bio_src)) {
+ int ret;
+
+ ret = bio_integrity_clone(bio, bio_src, gfp_mask);
+ if (ret < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ }
+
+ bio_clone_blkcg_association(bio, bio_src);
+
+ return bio;
+}
+
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
mempool_t *pool)
{
@@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
generic_make_request(*bio_orig);
*bio_orig = bio;
}
- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+ bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 9419def8c017..f3501cdaf1a6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
- if (IS_ERR(job->request))
- return PTR_ERR(job->request);
- return 0;
+
+ return PTR_ERR_OR_ZERO(job->request);
}
static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
diff --git a/block/bsg.c b/block/bsg.c
index 3da540faf673..db588add6ba6 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -13,11 +13,9 @@
#include <linux/init.h>
#include <linux/file.h>
#include <linux/blkdev.h>
-#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
-#include <linux/uio.h>
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/slab.h>
@@ -38,21 +36,10 @@
struct bsg_device {
struct request_queue *queue;
spinlock_t lock;
- struct list_head busy_list;
- struct list_head done_list;
struct hlist_node dev_list;
atomic_t ref_count;
- int queued_cmds;
- int done_cmds;
- wait_queue_head_t wq_done;
- wait_queue_head_t wq_free;
char name[20];
int max_queue;
- unsigned long flags;
-};
-
-enum {
- BSG_F_BLOCK = 1,
};
#define BSG_DEFAULT_CMDS 64
@@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
static struct class *bsg_class;
static int bsg_major;
-static struct kmem_cache *bsg_cmd_cachep;
-
-/*
- * our internal command type
- */
-struct bsg_command {
- struct bsg_device *bd;
- struct list_head list;
- struct request *rq;
- struct bio *bio;
- struct bio *bidi_bio;
- int err;
- struct sg_io_v4 hdr;
-};
-
-static void bsg_free_command(struct bsg_command *bc)
-{
- struct bsg_device *bd = bc->bd;
- unsigned long flags;
-
- kmem_cache_free(bsg_cmd_cachep, bc);
-
- spin_lock_irqsave(&bd->lock, flags);
- bd->queued_cmds--;
- spin_unlock_irqrestore(&bd->lock, flags);
-
- wake_up(&bd->wq_free);
-}
-
-static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
-{
- struct bsg_command *bc = ERR_PTR(-EINVAL);
-
- spin_lock_irq(&bd->lock);
-
- if (bd->queued_cmds >= bd->max_queue)
- goto out;
-
- bd->queued_cmds++;
- spin_unlock_irq(&bd->lock);
-
- bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
- if (unlikely(!bc)) {
- spin_lock_irq(&bd->lock);
- bd->queued_cmds--;
- bc = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- bc->bd = bd;
- INIT_LIST_HEAD(&bc->list);
- bsg_dbg(bd, "returning free cmd %p\n", bc);
- return bc;
-out:
- spin_unlock_irq(&bd->lock);
- return bc;
-}
-
static inline struct hlist_head *bsg_dev_idx_hash(int index)
{
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
@@ -285,101 +214,6 @@ out:
return ERR_PTR(ret);
}
-/*
- * async completion call-back from the block layer, when scsi/ide/whatever
- * calls end_that_request_last() on a request
- */
-static void bsg_rq_end_io(struct request *rq, blk_status_t status)
-{
- struct bsg_command *bc = rq->end_io_data;
- struct bsg_device *bd = bc->bd;
- unsigned long flags;
-
- bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
- rq, bc, bc->bio);
-
- bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
-
- spin_lock_irqsave(&bd->lock, flags);
- list_move_tail(&bc->list, &bd->done_list);
- bd->done_cmds++;
- spin_unlock_irqrestore(&bd->lock, flags);
-
- wake_up(&bd->wq_done);
-}
-
-/*
- * do final setup of a 'bc' and submit the matching 'rq' to the block
- * layer for io
- */
-static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
- struct bsg_command *bc, struct request *rq)
-{
- int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
-
- /*
- * add bc command to busy queue and submit rq for io
- */
- bc->rq = rq;
- bc->bio = rq->bio;
- if (rq->next_rq)
- bc->bidi_bio = rq->next_rq->bio;
- bc->hdr.duration = jiffies;
- spin_lock_irq(&bd->lock);
- list_add_tail(&bc->list, &bd->busy_list);
- spin_unlock_irq(&bd->lock);
-
- bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
-
- rq->end_io_data = bc;
- blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
-}
-
-static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
-{
- struct bsg_command *bc = NULL;
-
- spin_lock_irq(&bd->lock);
- if (bd->done_cmds) {
- bc = list_first_entry(&bd->done_list, struct bsg_command, list);
- list_del(&bc->list);
- bd->done_cmds--;
- }
- spin_unlock_irq(&bd->lock);
-
- return bc;
-}
-
-/*
- * Get a finished command from the done list
- */
-static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
-{
- struct bsg_command *bc;
- int ret;
-
- do {
- bc = bsg_next_done_cmd(bd);
- if (bc)
- break;
-
- if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
- bc = ERR_PTR(-EAGAIN);
- break;
- }
-
- ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
- if (ret) {
- bc = ERR_PTR(-ERESTARTSYS);
- break;
- }
- } while (1);
-
- bsg_dbg(bd, "returning done %p\n", bc);
-
- return bc;
-}
-
static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct bio *bio, struct bio *bidi_bio)
{
@@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
return ret;
}
-static bool bsg_complete(struct bsg_device *bd)
-{
- bool ret = false;
- bool spin;
-
- do {
- spin_lock_irq(&bd->lock);
-
- BUG_ON(bd->done_cmds > bd->queued_cmds);
-
- /*
- * All commands consumed.
- */
- if (bd->done_cmds == bd->queued_cmds)
- ret = true;
-
- spin = !test_bit(BSG_F_BLOCK, &bd->flags);
-
- spin_unlock_irq(&bd->lock);
- } while (!ret && spin);
-
- return ret;
-}
-
-static int bsg_complete_all_commands(struct bsg_device *bd)
-{
- struct bsg_command *bc;
- int ret, tret;
-
- bsg_dbg(bd, "entered\n");
-
- /*
- * wait for all commands to complete
- */
- io_wait_event(bd->wq_done, bsg_complete(bd));
-
- /*
- * discard done commands
- */
- ret = 0;
- do {
- spin_lock_irq(&bd->lock);
- if (!bd->queued_cmds) {
- spin_unlock_irq(&bd->lock);
- break;
- }
- spin_unlock_irq(&bd->lock);
-
- bc = bsg_get_done_cmd(bd);
- if (IS_ERR(bc))
- break;
-
- tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
- bc->bidi_bio);
- if (!ret)
- ret = tret;
-
- bsg_free_command(bc);
- } while (1);
-
- return ret;
-}
-
-static int
-__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
- const struct iovec *iov, ssize_t *bytes_read)
-{
- struct bsg_command *bc;
- int nr_commands, ret;
-
- if (count % sizeof(struct sg_io_v4))
- return -EINVAL;
-
- ret = 0;
- nr_commands = count / sizeof(struct sg_io_v4);
- while (nr_commands) {
- bc = bsg_get_done_cmd(bd);
- if (IS_ERR(bc)) {
- ret = PTR_ERR(bc);
- break;
- }
-
- /*
- * this is the only case where we need to copy data back
- * after completing the request. so do that here,
- * bsg_complete_work() cannot do that for us
- */
- ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
- bc->bidi_bio);
-
- if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
- ret = -EFAULT;
-
- bsg_free_command(bc);
-
- if (ret)
- break;
-
- buf += sizeof(struct sg_io_v4);
- *bytes_read += sizeof(struct sg_io_v4);
- nr_commands--;
- }
-
- return ret;
-}
-
-static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
-{
- if (file->f_flags & O_NONBLOCK)
- clear_bit(BSG_F_BLOCK, &bd->flags);
- else
- set_bit(BSG_F_BLOCK, &bd->flags);
-}
-
-/*
- * Check if the error is a "real" error that we should return.
- */
-static inline int err_block_err(int ret)
-{
- if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
- return 1;
-
- return 0;
-}
-
-static ssize_t
-bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
- struct bsg_device *bd = file->private_data;
- int ret;
- ssize_t bytes_read;
-
- bsg_dbg(bd, "read %zd bytes\n", count);
-
- bsg_set_block(bd, file);
-
- bytes_read = 0;
- ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
- *ppos = bytes_read;
-
- if (!bytes_read || err_block_err(ret))
- bytes_read = ret;
-
- return bytes_read;
-}
-
-static int __bsg_write(struct bsg_device *bd, const char __user *buf,
- size_t count, ssize_t *bytes_written, fmode_t mode)
-{
- struct bsg_command *bc;
- struct request *rq;
- int ret, nr_commands;
-
- if (count % sizeof(struct sg_io_v4))
- return -EINVAL;
-
- nr_commands = count / sizeof(struct sg_io_v4);
- rq = NULL;
- bc = NULL;
- ret = 0;
- while (nr_commands) {
- struct request_queue *q = bd->queue;
-
- bc = bsg_alloc_command(bd);
- if (IS_ERR(bc)) {
- ret = PTR_ERR(bc);
- bc = NULL;
- break;
- }
-
- if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
- ret = -EFAULT;
- break;
- }
-
- /*
- * get a request, fill in the blanks, and add to request queue
- */
- rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
- if (IS_ERR(rq)) {
- ret = PTR_ERR(rq);
- rq = NULL;
- break;
- }
-
- bsg_add_command(bd, q, bc, rq);
- bc = NULL;
- rq = NULL;
- nr_commands--;
- buf += sizeof(struct sg_io_v4);
- *bytes_written += sizeof(struct sg_io_v4);
- }
-
- if (bc)
- bsg_free_command(bc);
-
- return ret;
-}
-
-static ssize_t
-bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-{
- struct bsg_device *bd = file->private_data;
- ssize_t bytes_written;
- int ret;
-
- bsg_dbg(bd, "write %zd bytes\n", count);
-
- if (unlikely(uaccess_kernel()))
- return -EINVAL;
-
- bsg_set_block(bd, file);
-
- bytes_written = 0;
- ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
-
- *ppos = bytes_written;
-
- /*
- * return bytes written on non-fatal errors
- */
- if (!bytes_written || err_block_err(ret))
- bytes_written = ret;
-
- bsg_dbg(bd, "returning %zd\n", bytes_written);
- return bytes_written;
-}
-
static struct bsg_device *bsg_alloc_device(void)
{
struct bsg_device *bd;
@@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void)
return NULL;
spin_lock_init(&bd->lock);
-
bd->max_queue = BSG_DEFAULT_CMDS;
-
- INIT_LIST_HEAD(&bd->busy_list);
- INIT_LIST_HEAD(&bd->done_list);
INIT_HLIST_NODE(&bd->dev_list);
-
- init_waitqueue_head(&bd->wq_free);
- init_waitqueue_head(&bd->wq_done);
return bd;
}
static int bsg_put_device(struct bsg_device *bd)
{
- int ret = 0, do_free;
struct request_queue *q = bd->queue;
mutex_lock(&bsg_mutex);
- do_free = atomic_dec_and_test(&bd->ref_count);
- if (!do_free) {
+ if (!atomic_dec_and_test(&bd->ref_count)) {
mutex_unlock(&bsg_mutex);
- goto out;
+ return 0;
}
hlist_del(&bd->dev_list);
@@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd)
/*
* close can always block
*/
- set_bit(BSG_F_BLOCK, &bd->flags);
-
- /*
- * correct error detection baddies here again. it's the responsibility
- * of the app to properly reap commands before close() if it wants
- * fool-proof error detection
- */
- ret = bsg_complete_all_commands(bd);
-
kfree(bd);
-out:
- if (do_free)
- blk_put_queue(q);
- return ret;
+ blk_put_queue(q);
+ return 0;
}
static struct bsg_device *bsg_add_device(struct inode *inode,
@@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
bd->queue = rq;
- bsg_set_block(bd, file);
-
atomic_set(&bd->ref_count, 1);
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
@@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file)
return bsg_put_device(bd);
}
-static __poll_t bsg_poll(struct file *file, poll_table *wait)
-{
- struct bsg_device *bd = file->private_data;
- __poll_t mask = 0;
-
- poll_wait(file, &bd->wq_done, wait);
- poll_wait(file, &bd->wq_free, wait);
-
- spin_lock_irq(&bd->lock);
- if (!list_empty(&bd->done_list))
- mask |= EPOLLIN | EPOLLRDNORM;
- if (bd->queued_cmds < bd->max_queue)
- mask |= EPOLLOUT;
- spin_unlock_irq(&bd->lock);
-
- return mask;
-}
-
static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct bsg_device *bd = file->private_data;
@@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
static const struct file_operations bsg_fops = {
- .read = bsg_read,
- .write = bsg_write,
- .poll = bsg_poll,
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
@@ -977,21 +540,12 @@ static int __init bsg_init(void)
int ret, i;
dev_t devid;
- bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
- sizeof(struct bsg_command), 0, 0, NULL);
- if (!bsg_cmd_cachep) {
- printk(KERN_ERR "bsg: failed creating slab cache\n");
- return -ENOMEM;
- }
-
for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
INIT_HLIST_HEAD(&bsg_device_list[i]);
bsg_class = class_create(THIS_MODULE, "bsg");
- if (IS_ERR(bsg_class)) {
- ret = PTR_ERR(bsg_class);
- goto destroy_kmemcache;
- }
+ if (IS_ERR(bsg_class))
+ return PTR_ERR(bsg_class);
bsg_class->devnode = bsg_devnode;
ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
@@ -1012,8 +566,6 @@ unregister_chrdev:
unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
destroy_bsg_class:
class_destroy(bsg_class);
-destroy_kmemcache:
- kmem_cache_destroy(bsg_cmd_cachep);
return ret;
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 82b6c27b3245..2eb87444b157 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
switch (ioprio_class) {
default:
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
+ /* fall through */
case IOPRIO_CLASS_NONE:
/*
* no prio set, inherit CPU scheduling settings
@@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct cfq_data *cfqd = e->elevator_data; \
- unsigned int __data; \
+ unsigned int __data, __min = (MIN), __max = (MAX); \
+ \
cfq_var_store(&__data, (page)); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
+ if (__data < __min) \
+ __data = __min; \
+ else if (__data > __max) \
+ __data = __max; \
if (__CONV) \
*(__PTR) = (u64)__data * NSEC_PER_MSEC; \
else \
@@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX,
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct cfq_data *cfqd = e->elevator_data; \
- unsigned int __data; \
+ unsigned int __data, __min = (MIN), __max = (MAX); \
+ \
cfq_var_store(&__data, (page)); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
+ if (__data < __min) \
+ __data = __min; \
+ else if (__data > __max) \
+ __data = __max; \
*(__PTR) = (u64)__data * NSEC_PER_USEC; \
return count; \
}
diff --git a/block/genhd.c b/block/genhd.c
index f1543a45e73b..8cc719a37b32 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
part_round_stats(gp->queue, cpu, hd);
part_stat_unlock();
part_in_flight(gp->queue, hd, inflight);
- seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
- "%u %lu %lu %lu %u %u %u %u\n",
+ seq_printf(seqf, "%4d %7d %s "
+ "%lu %lu %lu %u "
+ "%lu %lu %lu %u "
+ "%u %u %u "
+ "%lu %lu %lu %u\n",
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
disk_name(gp, hd->partno, buf),
- part_stat_read(hd, ios[READ]),
- part_stat_read(hd, merges[READ]),
- part_stat_read(hd, sectors[READ]),
- jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
- part_stat_read(hd, ios[WRITE]),
- part_stat_read(hd, merges[WRITE]),
- part_stat_read(hd, sectors[WRITE]),
- jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
+ part_stat_read(hd, ios[STAT_READ]),
+ part_stat_read(hd, merges[STAT_READ]),
+ part_stat_read(hd, sectors[STAT_READ]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
+ part_stat_read(hd, ios[STAT_WRITE]),
+ part_stat_read(hd, merges[STAT_WRITE]),
+ part_stat_read(hd, sectors[STAT_WRITE]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
inflight[0],
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
- jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+ jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
+ part_stat_read(hd, ios[STAT_DISCARD]),
+ part_stat_read(hd, merges[STAT_DISCARD]),
+ part_stat_read(hd, sectors[STAT_DISCARD]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
);
}
disk_part_iter_exit(&piter);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3dcfd4ec0e11..5a8975a1201c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev,
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
- "%8u %8u %8u"
+ "%8u %8u %8u "
+ "%8lu %8lu %8llu %8u"
"\n",
- part_stat_read(p, ios[READ]),
- part_stat_read(p, merges[READ]),
- (unsigned long long)part_stat_read(p, sectors[READ]),
- jiffies_to_msecs(part_stat_read(p, ticks[READ])),
- part_stat_read(p, ios[WRITE]),
- part_stat_read(p, merges[WRITE]),
- (unsigned long long)part_stat_read(p, sectors[WRITE]),
- jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
+ part_stat_read(p, ios[STAT_READ]),
+ part_stat_read(p, merges[STAT_READ]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_READ]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
+ part_stat_read(p, ios[STAT_WRITE]),
+ part_stat_read(p, merges[STAT_WRITE]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
inflight[0],
jiffies_to_msecs(part_stat_read(p, io_ticks)),
- jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+ jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+ part_stat_read(p, ios[STAT_DISCARD]),
+ part_stat_read(p, merges[STAT_DISCARD]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
}
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 007f95eea0e1..903f3ed175d0 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state)
u32 vgda_sector = 0;
u32 vgda_len = 0;
int numlvs = 0;
- struct pvd *pvd;
+ struct pvd *pvd = NULL;
struct lv_info {
unsigned short pps_per_lv;
unsigned short pps_found;
@@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state)
if (lvip[i].pps_per_lv)
foundlvs += 1;
}
+ /* pvd loops depend on n[].name and lvip[].pps_per_lv */
+ pvd = alloc_pvd(state, vgda_sector + 17);
}
put_dev_sector(sect);
}
- pvd = alloc_pvd(state, vgda_sector + 17);
if (pvd) {
int numpps = be16_to_cpu(pvd->pp_count);
int psn_part1 = be32_to_cpu(pvd->psn_part1);
@@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state)
next_lp_ix += 1;
}
for (i = 0; i < state->limit; i += 1)
- if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
+ if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) {
+ char tmp[sizeof(n[i].name) + 1]; // null char
+
+ snprintf(tmp, sizeof(tmp), "%s", n[i].name);
pr_warn("partition %s (%u pp's found) is "
"not contiguous\n",
- n[i].name, lvip[i].pps_found);
+ tmp, lvip[i].pps_found);
+ }
kfree(pvd);
}
kfree(n);
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 0417937dfe99..16766f267559 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
{
char buf[64];
int r_objid, r_name, r_id1, r_id2, len;
- struct vblk_dgrp *dgrp;
BUG_ON (!buffer || !vb);
@@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
if (len != get_unaligned_be32(buffer + 0x14))
return false;
- dgrp = &vb->vblk.dgrp;
-
ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
return true;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index a98db384048f..62aed77d0bb9 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
.verify_fn = t10_pi_type3_verify_ip,
};
EXPORT_SYMBOL(t10_pi_type3_ip);
+
+/**
+ * t10_pi_prepare - prepare PI prior submitting request to device
+ * @rq: request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Remap protection information to match the
+ * physical LBA.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u32 ref_tag = t10_pi_ref_tag(rq);
+ struct bio *bio;
+
+ if (protection_type == T10_PI_TYPE3_PROTECTION)
+ return;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u32 virt = bip_get_seed(bip) & 0xffffffff;
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ /* Already remapped? */
+ if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+ break;
+
+ bip_for_each_vec(iv, bip, iter) {
+ void *p, *pmap;
+ unsigned int j;
+
+ pmap = kmap_atomic(iv.bv_page);
+ p = pmap + iv.bv_offset;
+ for (j = 0; j < iv.bv_len; j += tuple_sz) {
+ struct t10_pi_tuple *pi = p;
+
+ if (be32_to_cpu(pi->ref_tag) == virt)
+ pi->ref_tag = cpu_to_be32(ref_tag);
+ virt++;
+ ref_tag++;
+ p += tuple_sz;
+ }
+
+ kunmap_atomic(pmap);
+ }
+
+ bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+ }
+}
+EXPORT_SYMBOL(t10_pi_prepare);
+
+/**
+ * t10_pi_complete - prepare PI prior returning request to the block layer
+ * @rq: request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ * @intervals: total elements to prepare
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Since the physical start sector was submitted
+ * to the device, we should remap it back to virtual values expected by the
+ * block layer.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_complete(struct request *rq, u8 protection_type,
+ unsigned int intervals)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u32 ref_tag = t10_pi_ref_tag(rq);
+ struct bio *bio;
+
+ if (protection_type == T10_PI_TYPE3_PROTECTION)
+ return;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u32 virt = bip_get_seed(bip) & 0xffffffff;
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ bip_for_each_vec(iv, bip, iter) {
+ void *p, *pmap;
+ unsigned int j;
+
+ pmap = kmap_atomic(iv.bv_page);
+ p = pmap + iv.bv_offset;
+ for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+ struct t10_pi_tuple *pi = p;
+
+ if (be32_to_cpu(pi->ref_tag) == ref_tag)
+ pi->ref_tag = cpu_to_be32(virt);
+ virt++;
+ ref_tag++;
+ intervals--;
+ p += tuple_sz;
+ }
+
+ kunmap_atomic(pmap);
+ }
+ }
+}
+EXPORT_SYMBOL(t10_pi_complete);
diff --git a/drivers/Makefile b/drivers/Makefile
index 24cd47014657..a6abd7a856c6 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += macintosh/
obj-$(CONFIG_IDE) += ide/
-obj-$(CONFIG_SCSI) += scsi/
+obj-y += scsi/
obj-y += nvme/
obj-$(CONFIG_ATA) += ata/
obj-$(CONFIG_TARGET_CORE) += target/
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index aad1b01447de..8e270962b2f3 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev,
int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
{
int rc = 0;
+ u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
u8 scsi_cmd[MAX_COMMAND_SIZE];
- u8 args[4], *argbuf = NULL, *sensebuf = NULL;
+ u8 args[4], *argbuf = NULL;
int argsize = 0;
enum dma_data_direction data_dir;
struct scsi_sense_hdr sshdr;
@@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
if (copy_from_user(args, arg, sizeof(args)))
return -EFAULT;
- sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
- if (!sensebuf)
- return -ENOMEM;
-
+ memset(sensebuf, 0, sizeof(sensebuf));
memset(scsi_cmd, 0, sizeof(scsi_cmd));
if (args[3]) {
@@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
&& copy_to_user(arg + sizeof(args), argbuf, argsize))
rc = -EFAULT;
error:
- kfree(sensebuf);
kfree(argbuf);
return rc;
}
@@ -704,8 +701,9 @@ error:
int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
{
int rc = 0;
+ u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
u8 scsi_cmd[MAX_COMMAND_SIZE];
- u8 args[7], *sensebuf = NULL;
+ u8 args[7];
struct scsi_sense_hdr sshdr;
int cmd_result;
@@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
if (copy_from_user(args, arg, sizeof(args)))
return -EFAULT;
- sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
- if (!sensebuf)
- return -ENOMEM;
-
+ memset(sensebuf, 0, sizeof(sensebuf));
memset(scsi_cmd, 0, sizeof(scsi_cmd));
scsi_cmd[0] = ATA_16;
scsi_cmd[1] = (3 << 1); /* Non-data */
@@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
}
error:
- kfree(sensebuf);
return rc;
}
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f6518067aa7d..f99e5c883368 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -21,6 +21,7 @@
#define DAC960_DriverDate "21 Aug 2007"
+#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/miscdevice.h>
@@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
return true;
}
-static int dac960_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v)
{
unsigned char *StatusMessage = "OK\n";
int ControllerNumber;
@@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m,
+ void *v)
{
DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
return 0;
}
-static int dac960_current_status_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m,
+ void *v)
{
DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
unsigned char *StatusMessage =
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ad9b687a236a..d4913516823f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -74,12 +74,12 @@ config AMIGA_Z2RAM
config CDROM
tristate
+ select BLK_SCSI_REQUEST
config GDROM
tristate "SEGA Dreamcast GD-ROM drive"
depends on SH_DREAMCAST
select CDROM
- select BLK_SCSI_REQUEST # only for the generic cdrom code
help
A standard SEGA Dreamcast comes with a modified CD ROM drive called a
"GD-ROM" by SEGA to signify it is capable of reading special disks
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index dc061158b403..8566b188368b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
-obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
+null_blk-objs := null_blk_main.o
+null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
+
skd-y := skd_main.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 096882e54095..136dc507d020 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1137,6 +1137,7 @@ noskb: if (buf)
break;
}
bvcpy(skb, f->buf->bio, f->iter, n);
+ /* fall through */
case ATA_CMD_PIO_WRITE:
case ATA_CMD_PIO_WRITE_EXT:
spin_lock_irq(&d->lock);
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 697f735b07a4..41060e9cedf2 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -284,8 +284,8 @@ freedev(struct aoedev *d)
e = t + d->ntargets;
for (; t < e && *t; t++)
freetgt(d, *t);
- if (d->bufpool)
- mempool_destroy(d->bufpool);
+
+ mempool_destroy(d->bufpool);
skbpoolfree(d);
minor_free(d->sysminor);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bb976598ee43..df8103dd40ac 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
* Process a single bvec of a bio.
*/
static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, bool is_write,
+ unsigned int len, unsigned int off, unsigned int op,
sector_t sector)
{
void *mem;
int err = 0;
- if (is_write) {
+ if (op_is_write(op)) {
err = copy_to_brd_setup(brd, sector, len);
if (err)
goto out;
}
mem = kmap_atomic(page);
- if (!is_write) {
+ if (!op_is_write(op)) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
} else {
@@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
int err;
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(bio_op(bio)), sector);
+ bio_op(bio), sector);
if (err)
goto io_error;
sector += len >> SECTOR_SHIFT;
@@ -310,15 +310,15 @@ io_error:
}
static int brd_rw_page(struct block_device *bdev, sector_t sector,
- struct page *page, bool is_write)
+ struct page *page, unsigned int op)
{
struct brd_device *brd = bdev->bd_disk->private_data;
int err;
if (PageTransHuge(page))
return -ENOTSUPP;
- err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
- page_endio(page, is_write, err);
+ err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
+ page_endio(page, op_is_write(op), err);
return err;
}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index bc4ed2ed40a2..e35a234b0a8f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -55,12 +55,10 @@
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
-# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
#else
# define __protected_by(x)
# define __protected_read_by(x)
# define __protected_write_by(x)
-# define __must_hold(x)
#endif
/* shared module parameters, defined in drbd_main.c */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a80809bd3057..ef8212a4b73e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2103,14 +2103,10 @@ static void drbd_destroy_mempools(void)
mempool_exit(&drbd_md_io_page_pool);
mempool_exit(&drbd_ee_mempool);
mempool_exit(&drbd_request_mempool);
- if (drbd_ee_cache)
- kmem_cache_destroy(drbd_ee_cache);
- if (drbd_request_cache)
- kmem_cache_destroy(drbd_request_cache);
- if (drbd_bm_ext_cache)
- kmem_cache_destroy(drbd_bm_ext_cache);
- if (drbd_al_ext_cache)
- kmem_cache_destroy(drbd_al_ext_cache);
+ kmem_cache_destroy(drbd_ee_cache);
+ kmem_cache_destroy(drbd_request_cache);
+ kmem_cache_destroy(drbd_bm_ext_cache);
+ kmem_cache_destroy(drbd_al_ext_cache);
drbd_ee_cache = NULL;
drbd_request_cache = NULL;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index be9450f5ad1c..75f6b47169e6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
if (c_min_rate == 0)
return false;
- curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
- (int)part_stat_read(&disk->part0, sectors[1]) -
+ curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
atomic_read(&device->rs_sect_ev);
if (atomic_read(&device->ap_actlog_cnt)
@@ -2790,6 +2789,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
then we would do something smarter here than reading
the block... */
peer_req->flags |= EE_RS_THIN_REQ;
+ /* fall through */
case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
@@ -2968,6 +2968,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
/* Else fall through to one of the other strategies... */
drbd_warn(device, "Discard younger/older primary did not find a decision\n"
"Using discard-least-changes instead\n");
+ /* fall through */
case ASB_DISCARD_ZERO_CHG:
if (ch_peer == 0 && ch_self == 0) {
rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
@@ -2979,6 +2980,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
}
if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
break;
+ /* else: fall through */
case ASB_DISCARD_LEAST_CHG:
if (ch_self < ch_peer)
rv = -1;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index d146fedc38bb..19cac36e9737 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
{
struct request_queue *q = device->rq_queue;
- generic_start_io_acct(q, bio_data_dir(req->master_bio),
+ generic_start_io_acct(q, bio_op(req->master_bio),
req->i.size >> 9, &device->vdisk->part0);
}
@@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
{
struct request_queue *q = device->rq_queue;
- generic_end_io_acct(q, bio_data_dir(req->master_bio),
+ generic_end_io_acct(q, bio_op(req->master_bio),
&device->vdisk->part0, req->start_jif);
}
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5e793dd7adfb..b8f77e83d456 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device)
atomic_set(&device->rs_sect_in, 0);
atomic_set(&device->rs_sect_ev, 0);
device->rs_in_flight = 0;
- device->rs_last_events =
- (int)part_stat_read(&disk->part0, sectors[0]) +
- (int)part_stat_read(&disk->part0, sectors[1]);
+ device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors);
/* Updating the RCU protected object in place is necessary since
this function gets called from atomic context.
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8871b5044d9e..48f622728ce6 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void)
int i;
int r;
int flags;
- int dflags;
unsigned long ready_date;
void (*function)(void);
@@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void)
if (fd_wait_for_completion(ready_date, function))
return;
}
- dflags = DRS->flags;
-
if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
setup_DMA();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4cb1d1be3cfb..ea9debf59b22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
unsigned int arg)
{
struct file *file, *old_file;
- struct inode *inode;
int error;
error = -ENXIO;
@@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
if (error)
goto out_putf;
- inode = file->f_mapping->host;
old_file = lo->lo_backing_file;
error = -EINVAL;
@@ -1611,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
case LOOP_GET_STATUS64:
case LOOP_SET_STATUS64:
arg = (unsigned long) compat_ptr(arg);
+ /* fall through */
case LOOP_SET_FD:
case LOOP_CHANGE_FD:
case LOOP_SET_BLOCK_SIZE:
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index c73626decb46..db253cd5b32a 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2575,8 +2575,7 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
static void mtip_hw_debugfs_exit(struct driver_data *dd)
{
- if (dd->dfs_node)
- debugfs_remove_recursive(dd->dfs_node);
+ debugfs_remove_recursive(dd->dfs_node);
}
/*
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
new file mode 100644
index 000000000000..d81781f22dba
--- /dev/null
+++ b/drivers/block/null_blk.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_NULL_BLK_H
+#define __BLK_NULL_BLK_H
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+#include <linux/configfs.h>
+#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
+
+struct nullb_cmd {
+ struct list_head list;
+ struct llist_node ll_list;
+ struct __call_single_data csd;
+ struct request *rq;
+ struct bio *bio;
+ unsigned int tag;
+ blk_status_t error;
+ struct nullb_queue *nq;
+ struct hrtimer timer;
+};
+
+struct nullb_queue {
+ unsigned long *tag_map;
+ wait_queue_head_t wait;
+ unsigned int queue_depth;
+ struct nullb_device *dev;
+ unsigned int requeue_selection;
+
+ struct nullb_cmd *cmds;
+};
+
+struct nullb_device {
+ struct nullb *nullb;
+ struct config_item item;
+ struct radix_tree_root data; /* data stored in the disk */
+ struct radix_tree_root cache; /* disk cache data */
+ unsigned long flags; /* device flags */
+ unsigned int curr_cache;
+ struct badblocks badblocks;
+
+ unsigned int nr_zones;
+ struct blk_zone *zones;
+ sector_t zone_size_sects;
+
+ unsigned long size; /* device size in MB */
+ unsigned long completion_nsec; /* time in ns to complete a request */
+ unsigned long cache_size; /* disk cache size in MB */
+ unsigned long zone_size; /* zone size in MB if device is zoned */
+ unsigned int submit_queues; /* number of submission queues */
+ unsigned int home_node; /* home node for the device */
+ unsigned int queue_mode; /* block interface */
+ unsigned int blocksize; /* block size */
+ unsigned int irqmode; /* IRQ completion handler */
+ unsigned int hw_queue_depth; /* queue depth */
+ unsigned int index; /* index of the disk, only valid with a disk */
+ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+ bool blocking; /* blocking blk-mq device */
+ bool use_per_node_hctx; /* use per-node allocation for hardware context */
+ bool power; /* power on/off the device */
+ bool memory_backed; /* if data is stored in memory */
+ bool discard; /* if support discard */
+ bool zoned; /* if device is zoned */
+};
+
+struct nullb {
+ struct nullb_device *dev;
+ struct list_head list;
+ unsigned int index;
+ struct request_queue *q;
+ struct gendisk *disk;
+ struct blk_mq_tag_set *tag_set;
+ struct blk_mq_tag_set __tag_set;
+ unsigned int queue_depth;
+ atomic_long_t cur_bytes;
+ struct hrtimer bw_timer;
+ unsigned long cache_flush_pos;
+ spinlock_t lock;
+
+ struct nullb_queue *queues;
+ unsigned int nr_queues;
+ char disk_name[DISK_NAME_LEN];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int null_zone_init(struct nullb_device *dev);
+void null_zone_exit(struct nullb_device *dev);
+blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd);
+void null_zone_write(struct nullb_cmd *cmd);
+void null_zone_reset(struct nullb_cmd *cmd);
+#else
+static inline int null_zone_init(struct nullb_device *dev)
+{
+ return -EINVAL;
+}
+static inline void null_zone_exit(struct nullb_device *dev) {}
+static inline blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd)
+{
+ return BLK_STS_NOTSUPP;
+}
+static inline void null_zone_write(struct nullb_cmd *cmd) {}
+static inline void null_zone_reset(struct nullb_cmd *cmd) {}
+#endif /* CONFIG_BLK_DEV_ZONED */
+#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk_main.c
index 042c778e5a4e..6127e3ff7b4b 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk_main.c
@@ -7,14 +7,8 @@
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/hrtimer.h>
-#include <linux/configfs.h>
-#include <linux/badblocks.h>
-#include <linux/fault-inject.h>
+#include "null_blk.h"
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
@@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps)
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
}
-struct nullb_cmd {
- struct list_head list;
- struct llist_node ll_list;
- struct __call_single_data csd;
- struct request *rq;
- struct bio *bio;
- unsigned int tag;
- blk_status_t error;
- struct nullb_queue *nq;
- struct hrtimer timer;
-};
-
-struct nullb_queue {
- unsigned long *tag_map;
- wait_queue_head_t wait;
- unsigned int queue_depth;
- struct nullb_device *dev;
- unsigned int requeue_selection;
-
- struct nullb_cmd *cmds;
-};
-
/*
* Status flags for nullb_device.
*
@@ -92,52 +64,6 @@ struct nullb_page {
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
#define NULLB_PAGE_FREE (MAP_SZ - 2)
-struct nullb_device {
- struct nullb *nullb;
- struct config_item item;
- struct radix_tree_root data; /* data stored in the disk */
- struct radix_tree_root cache; /* disk cache data */
- unsigned long flags; /* device flags */
- unsigned int curr_cache;
- struct badblocks badblocks;
-
- unsigned long size; /* device size in MB */
- unsigned long completion_nsec; /* time in ns to complete a request */
- unsigned long cache_size; /* disk cache size in MB */
- unsigned int submit_queues; /* number of submission queues */
- unsigned int home_node; /* home node for the device */
- unsigned int queue_mode; /* block interface */
- unsigned int blocksize; /* block size */
- unsigned int irqmode; /* IRQ completion handler */
- unsigned int hw_queue_depth; /* queue depth */
- unsigned int index; /* index of the disk, only valid with a disk */
- unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
- bool blocking; /* blocking blk-mq device */
- bool use_per_node_hctx; /* use per-node allocation for hardware context */
- bool power; /* power on/off the device */
- bool memory_backed; /* if data is stored in memory */
- bool discard; /* if support discard */
-};
-
-struct nullb {
- struct nullb_device *dev;
- struct list_head list;
- unsigned int index;
- struct request_queue *q;
- struct gendisk *disk;
- struct blk_mq_tag_set *tag_set;
- struct blk_mq_tag_set __tag_set;
- unsigned int queue_depth;
- atomic_long_t cur_bytes;
- struct hrtimer bw_timer;
- unsigned long cache_flush_pos;
- spinlock_t lock;
-
- struct nullb_queue *queues;
- unsigned int nr_queues;
- char disk_name[DISK_NAME_LEN];
-};
-
static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
@@ -254,6 +180,14 @@ static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+static bool g_zoned;
+module_param_named(zoned, g_zoned, bool, S_IRUGO);
+MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
+
+static unsigned long g_zone_size = 256;
+module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@@ -357,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool);
NULLB_DEVICE_ATTR(discard, bool);
NULLB_DEVICE_ATTR(mbps, uint);
NULLB_DEVICE_ATTR(cache_size, ulong);
+NULLB_DEVICE_ATTR(zoned, bool);
+NULLB_DEVICE_ATTR(zone_size, ulong);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -390,6 +326,7 @@ static ssize_t nullb_device_power_store(struct config_item *item,
null_del_dev(dev->nullb);
mutex_unlock(&lock);
clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+ clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
}
return count;
@@ -468,6 +405,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_mbps,
&nullb_device_attr_cache_size,
&nullb_device_attr_badblocks,
+ &nullb_device_attr_zoned,
+ &nullb_device_attr_zone_size,
NULL,
};
@@ -520,7 +459,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
- return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
+ return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -579,6 +518,8 @@ static struct nullb_device *null_alloc_dev(void)
dev->hw_queue_depth = g_hw_queue_depth;
dev->blocking = g_blocking;
dev->use_per_node_hctx = g_use_per_node_hctx;
+ dev->zoned = g_zoned;
+ dev->zone_size = g_zone_size;
return dev;
}
@@ -587,6 +528,7 @@ static void null_free_dev(struct nullb_device *dev)
if (!dev)
return;
+ null_zone_exit(dev);
badblocks_exit(&dev->badblocks);
kfree(dev);
}
@@ -862,7 +804,9 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb,
}
static struct nullb_page *null_insert_page(struct nullb *nullb,
- sector_t sector, bool ignore_cache)
+ sector_t sector, bool ignore_cache)
+ __releases(&nullb->lock)
+ __acquires(&nullb->lock)
{
u64 idx;
struct nullb_page *t_page;
@@ -1219,6 +1163,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
struct nullb *nullb = dev->nullb;
int err = 0;
+ if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
+ cmd->error = null_zone_report(nullb, cmd);
+ goto out;
+ }
+
if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
struct request *rq = cmd->rq;
@@ -1283,6 +1232,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
}
}
cmd->error = errno_to_blk_status(err);
+
+ if (!cmd->error && dev->zoned) {
+ if (req_op(cmd->rq) == REQ_OP_WRITE)
+ null_zone_write(cmd);
+ else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
+ null_zone_reset(cmd);
+ }
out:
/* Complete IO by inline, softirq or timer */
switch (dev->irqmode) {
@@ -1810,6 +1766,15 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_flush_queueable(nullb->q, true);
}
+ if (dev->zoned) {
+ rv = null_zone_init(dev);
+ if (rv)
+ goto out_cleanup_blk_queue;
+
+ blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
+ nullb->q->limits.zoned = BLK_ZONED_HM;
+ }
+
nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
@@ -1828,13 +1793,16 @@ static int null_add_dev(struct nullb_device *dev)
rv = null_gendisk_register(nullb);
if (rv)
- goto out_cleanup_blk_queue;
+ goto out_cleanup_zone;
mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
mutex_unlock(&lock);
return 0;
+out_cleanup_zone:
+ if (dev->zoned)
+ null_zone_exit(dev);
out_cleanup_blk_queue:
blk_cleanup_queue(nullb->q);
out_cleanup_tags:
@@ -1861,6 +1829,11 @@ static int __init null_init(void)
g_bs = PAGE_SIZE;
}
+ if (!is_power_of_2(g_zone_size)) {
+ pr_err("null_blk: zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.\n",
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
new file mode 100644
index 000000000000..a979ca00d7be
--- /dev/null
+++ b/drivers/block/null_blk_zoned.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/vmalloc.h>
+#include "null_blk.h"
+
+/* zone_size in MBs to sectors. */
+#define ZONE_SIZE_SHIFT 11
+
+static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
+{
+ return sect >> ilog2(dev->zone_size_sects);
+}
+
+int null_zone_init(struct nullb_device *dev)
+{
+ sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
+ sector_t sector = 0;
+ unsigned int i;
+
+ if (!is_power_of_2(dev->zone_size)) {
+ pr_err("null_blk: zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
+ dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
+ dev->nr_zones = dev_size >>
+ (SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+ dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!dev->zones)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->nr_zones; i++) {
+ struct blk_zone *zone = &dev->zones[i];
+
+ zone->start = zone->wp = sector;
+ zone->len = dev->zone_size_sects;
+ zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ zone->cond = BLK_ZONE_COND_EMPTY;
+
+ sector += dev->zone_size_sects;
+ }
+
+ return 0;
+}
+
+void null_zone_exit(struct nullb_device *dev)
+{
+ kvfree(dev->zones);
+}
+
+static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
+ unsigned int zno, unsigned int nr_zones)
+{
+ struct blk_zone_report_hdr *hdr = NULL;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ void *addr;
+ unsigned int zones_to_cpy;
+
+ bio_for_each_segment(bvec, rq->bio, iter) {
+ addr = kmap_atomic(bvec.bv_page);
+
+ zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
+
+ if (!hdr) {
+ hdr = (struct blk_zone_report_hdr *)addr;
+ hdr->nr_zones = nr_zones;
+ zones_to_cpy--;
+ addr += sizeof(struct blk_zone_report_hdr);
+ }
+
+ zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
+
+ memcpy(addr, &dev->zones[zno],
+ zones_to_cpy * sizeof(struct blk_zone));
+
+ kunmap_atomic(addr);
+
+ nr_zones -= zones_to_cpy;
+ zno += zones_to_cpy;
+
+ if (!nr_zones)
+ break;
+ }
+}
+
+blk_status_t null_zone_report(struct nullb *nullb,
+ struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = nullb->dev;
+ struct request *rq = cmd->rq;
+ unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+ unsigned int nr_zones = dev->nr_zones - zno;
+ unsigned int max_zones = (blk_rq_bytes(rq) /
+ sizeof(struct blk_zone)) - 1;
+
+ nr_zones = min_t(unsigned int, nr_zones, max_zones);
+
+ null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
+
+ return BLK_STS_OK;
+}
+
+void null_zone_write(struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ struct request *rq = cmd->rq;
+ sector_t sector = blk_rq_pos(rq);
+ unsigned int rq_sectors = blk_rq_sectors(rq);
+ unsigned int zno = null_zone_no(dev, sector);
+ struct blk_zone *zone = &dev->zones[zno];
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_FULL:
+ /* Cannot write to a full zone */
+ cmd->error = BLK_STS_IOERR;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_IMP_OPEN:
+ /* Writes must be at the write pointer position */
+ if (blk_rq_pos(rq) != zone->wp) {
+ cmd->error = BLK_STS_IOERR;
+ break;
+ }
+
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += rq_sectors;
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+ break;
+ default:
+ /* Invalid zone condition */
+ cmd->error = BLK_STS_IOERR;
+ break;
+ }
+}
+
+void null_zone_reset(struct nullb_cmd *cmd)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ struct request *rq = cmd->rq;
+ unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+ struct blk_zone *zone = &dev->zones[zno];
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+}
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
index 4f27e7392e38..f5f63ca2889d 100644
--- a/drivers/block/paride/bpck.c
+++ b/drivers/block/paride/bpck.c
@@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
static void bpck_read_eeprom ( PIA *pi, char * buf )
-{ int i,j,k,n,p,v,f, om, od;
+{ int i, j, k, p, v, f, om, od;
bpck_force_spp(pi);
@@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf )
bpck_connect(pi);
- n = 0;
WR(4,0);
for (i=0;i<64;i++) {
WR(6,8);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 8961b190e256..7cf947586fe4 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -426,6 +426,7 @@ static void run_fsm(void)
pd_claimed = 1;
if (!pi_schedule_claimed(pi_current, run_fsm))
return;
+ /* fall through */
case 1:
pd_claimed = 2;
pi_current->proto->connect(pi_current);
@@ -445,6 +446,7 @@ static void run_fsm(void)
spin_unlock_irqrestore(&pd_lock, saved_flags);
if (stop)
return;
+ /* fall through */
case Hold:
schedule_fsm();
return;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b3f83cd96f33..e285413d4a75 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -67,7 +67,7 @@
#include <scsi/scsi.h>
#include <linux/debugfs.h>
#include <linux/device.h>
-
+#include <linux/nospec.h>
#include <linux/uaccess.h>
#define DRIVER_NAME "pktcdvd"
@@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index)
static void pkt_dump_sense(struct pktcdvd_device *pd,
struct packet_command *cgc)
{
- struct request_sense *sense = cgc->sense;
+ struct scsi_sense_hdr *sshdr = cgc->sshdr;
- if (sense)
+ if (sshdr)
pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
CDROM_PACKET_SIZE, cgc->cmd,
- sense->sense_key, sense->asc, sense->ascq,
- sense_key_string(sense->sense_key));
+ sshdr->sense_key, sshdr->asc, sshdr->ascq,
+ sense_key_string(sshdr->sense_key));
else
pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
}
@@ -787,18 +787,19 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
unsigned write_speed, unsigned read_speed)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
int ret;
init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
cgc.cmd[0] = GPCMD_SET_SPEED;
cgc.cmd[2] = (read_speed >> 8) & 0xff;
cgc.cmd[3] = read_speed & 0xff;
cgc.cmd[4] = (write_speed >> 8) & 0xff;
cgc.cmd[5] = write_speed & 0xff;
- if ((ret = pkt_generic_packet(pd, &cgc)))
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret)
pkt_dump_sense(pd, &cgc);
return ret;
@@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
cgc.cmd[8] = cgc.buflen = 2;
cgc.quiet = 1;
- if ((ret = pkt_generic_packet(pd, &cgc)))
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret)
return ret;
/* not all drives have the same disc_info length, so requeue
@@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type,
cgc.cmd[8] = 8;
cgc.quiet = 1;
- if ((ret = pkt_generic_packet(pd, &cgc)))
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret)
return ret;
cgc.buflen = be16_to_cpu(ti->track_information_length) +
@@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
__u32 last_track;
int ret = -1;
- if ((ret = pkt_get_disc_info(pd, &di)))
+ ret = pkt_get_disc_info(pd, &di);
+ if (ret)
return ret;
last_track = (di.last_track_msb << 8) | di.last_track_lsb;
- if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+ ret = pkt_get_track_info(pd, last_track, 1, &ti);
+ if (ret)
return ret;
/* if this track is blank, try the previous. */
if (ti.blank) {
last_track--;
- if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+ ret = pkt_get_track_info(pd, last_track, 1, &ti);
+ if (ret)
return ret;
}
@@ -1645,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
write_param_page *wp;
char buffer[128];
int ret, size;
@@ -1656,8 +1662,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
memset(buffer, 0, sizeof(buffer));
init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
- cgc.sense = &sense;
- if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+ cgc.sshdr = &sshdr;
+ ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+ if (ret) {
pkt_dump_sense(pd, &cgc);
return ret;
}
@@ -1671,8 +1678,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
* now get it all
*/
init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
- cgc.sense = &sense;
- if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+ cgc.sshdr = &sshdr;
+ ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+ if (ret) {
pkt_dump_sense(pd, &cgc);
return ret;
}
@@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
cgc.buflen = cgc.cmd[8] = size;
- if ((ret = pkt_mode_select(pd, &cgc))) {
+ ret = pkt_mode_select(pd, &cgc);
+ if (ret) {
pkt_dump_sense(pd, &cgc);
return ret;
}
@@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
memset(&di, 0, sizeof(disc_information));
memset(&ti, 0, sizeof(track_information));
- if ((ret = pkt_get_disc_info(pd, &di))) {
+ ret = pkt_get_disc_info(pd, &di);
+ if (ret) {
pkt_err(pd, "failed get_disc\n");
return ret;
}
@@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
- if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
+ ret = pkt_get_track_info(pd, track, 1, &ti);
+ if (ret) {
pkt_err(pd, "failed get_track\n");
return ret;
}
@@ -1905,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
int set)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
unsigned char buf[64];
int ret;
init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
cgc.buflen = pd->mode_offset + 12;
/*
@@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
*/
cgc.quiet = 1;
- if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0)))
+ ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
+ if (ret)
return ret;
buf[pd->mode_offset + 10] |= (!!set << 2);
@@ -1950,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
unsigned *write_speed)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
unsigned char buf[256+18];
unsigned char *cap_buf;
int ret, offset;
cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
if (ret) {
@@ -2011,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
unsigned *speed)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
unsigned char buf[64];
unsigned int size, st, sp;
int ret;
init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
cgc.cmd[1] = 2;
cgc.cmd[2] = 4; /* READ ATIP */
@@ -2032,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
size = sizeof(buf);
init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
cgc.cmd[1] = 2;
cgc.cmd[2] = 4;
@@ -2083,17 +2095,18 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
{
struct packet_command cgc;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
int ret;
pkt_dbg(2, pd, "Performing OPC\n");
init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sense = &sense;
+ cgc.sshdr = &sshdr;
cgc.timeout = 60*HZ;
cgc.cmd[0] = GPCMD_SEND_OPC;
cgc.cmd[1] = 1;
- if ((ret = pkt_generic_packet(pd, &cgc)))
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret)
pkt_dump_sense(pd, &cgc);
return ret;
}
@@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd)
int ret;
unsigned int write_speed, media_write_speed, read_speed;
- if ((ret = pkt_probe_settings(pd))) {
+ ret = pkt_probe_settings(pd);
+ if (ret) {
pkt_dbg(2, pd, "failed probe\n");
return ret;
}
- if ((ret = pkt_set_write_settings(pd))) {
+ ret = pkt_set_write_settings(pd);
+ if (ret) {
pkt_dbg(1, pd, "failed saving write settings\n");
return -EIO;
}
pkt_write_caching(pd, USE_WCACHING);
- if ((ret = pkt_get_max_speed(pd, &write_speed)))
+ ret = pkt_get_max_speed(pd, &write_speed);
+ if (ret)
write_speed = 16 * 177;
switch (pd->mmc3_profile) {
case 0x13: /* DVD-RW */
@@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd)
pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
break;
default:
- if ((ret = pkt_media_speed(pd, &media_write_speed)))
+ ret = pkt_media_speed(pd, &media_write_speed);
+ if (ret)
media_write_speed = 16;
write_speed = min(write_speed, media_write_speed * 177);
pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
@@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd)
}
read_speed = write_speed;
- if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
+ ret = pkt_set_speed(pd, write_speed, read_speed);
+ if (ret) {
pkt_dbg(1, pd, "couldn't set write speed\n");
return -EIO;
}
pd->write_speed = write_speed;
pd->read_speed = read_speed;
- if ((ret = pkt_perform_opc(pd))) {
+ ret = pkt_perform_opc(pd);
+ if (ret) {
pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
}
@@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
* so bdget() can't fail.
*/
bdget(pd->bdev->bd_dev);
- if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
+ ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd);
+ if (ret)
goto out;
- if ((ret = pkt_get_last_written(pd, &lba))) {
+ ret = pkt_get_last_written(pd, &lba);
+ if (ret) {
pkt_err(pd, "pkt_get_last_written failed\n");
goto out_putdev;
}
@@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
q = bdev_get_queue(pd->bdev);
if (write) {
- if ((ret = pkt_open_write(pd)))
+ ret = pkt_open_write(pd);
+ if (ret)
goto out_putdev;
/*
* Some CDRW drives can not handle writes larger than one packet,
@@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
clear_bit(PACKET_WRITABLE, &pd->flags);
}
- if ((ret = pkt_set_segment_merging(pd, q)))
+ ret = pkt_set_segment_merging(pd, q);
+ if (ret)
goto out_putdev;
if (write) {
@@ -2231,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
{
if (dev_minor >= MAX_WRITERS)
return NULL;
+
+ dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
return pkt_devs[dev_minor];
}
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index dddb3f2490b6..1a92f9e65937 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = {
static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
{
- generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio),
+ generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
&card->gendisk->part0);
}
@@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card,
struct bio *bio,
unsigned long start_time)
{
- generic_end_io_acct(card->queue, bio_data_dir(bio),
- &card->gendisk->part0, start_time);
+ generic_end_io_acct(card->queue, bio_op(bio),
+ &card->gendisk->part0, start_time);
}
static void bio_dma_done_cb(struct rsxx_cardinfo *card,
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index bc7aea6d7b7c..87b9e7fbf062 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev,
if (unlikely(skdev->dbg_level > 1)) {
dev_dbg(&skdev->pdev->dev,
- "skreq=%x sksg_list=%p sksg_dma=%llx\n",
- skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+ "skreq=%x sksg_list=%p sksg_dma=%pad\n",
+ skreq->id, skreq->sksg_list, &skreq->sksg_dma_address);
for (i = 0; i < n_sg; i++) {
struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
@@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev,
{
u64 qcmd;
- dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n",
- skmsg->mb_dma_address, skd_in_flight(skdev));
+ dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n",
+ &skmsg->mb_dma_address, skd_in_flight(skdev));
dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf);
qcmd = skmsg->mb_dma_address;
@@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
}
dev_dbg(&skdev->pdev->dev,
- "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+ "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n",
skspcl, skspcl->req.id, skspcl->req.sksg_list,
- skspcl->req.sksg_dma_address);
+ &skspcl->req.sksg_dma_address);
for (i = 0; i < skspcl->req.n_sg; i++) {
struct fit_sg_descriptor *sgd =
&skspcl->req.sksg_list[i];
@@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev)
WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) &
(FIT_QCMD_ALIGN - 1),
- "not aligned: msg_buf %p mb_dma_address %#llx\n",
- skmsg->msg_buf, skmsg->mb_dma_address);
+ "not aligned: msg_buf %p mb_dma_address %pad\n",
+ skmsg->msg_buf, &skmsg->mb_dma_address);
memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b5cedccb5d7d..8986adab9bf5 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock);
#define GRANTS_PER_INDIRECT_FRAME \
(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
-#define PSEGS_PER_INDIRECT_FRAME \
- (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS)
-
#define INDIRECT_GREFS(_grants) \
DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
-#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
-
static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
static void blkfront_gather_backend_features(struct blkfront_info *info);
static int negotiate_mq(struct blkfront_info *info);
@@ -1441,7 +1436,7 @@ static bool blkif_completion(unsigned long *id,
/* Wait the second response if not yet here. */
if (s2->status == REQ_WAITING)
- return 0;
+ return false;
bret->status = blkif_get_final_status(s->status,
s2->status);
@@ -1542,7 +1537,7 @@ static bool blkif_completion(unsigned long *id,
}
}
- return 1;
+ return true;
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a390c6d4f72d..c7acf74253a1 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1287,17 +1287,16 @@ static void zram_bio_discard(struct zram *zram, u32 index,
* Returns 1 if IO request was successfully submitted.
*/
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, bool is_write, struct bio *bio)
+ int offset, unsigned int op, struct bio *bio)
{
unsigned long start_time = jiffies;
- int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
struct request_queue *q = zram->disk->queue;
int ret;
- generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+ generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
&zram->disk->part0);
- if (!is_write) {
+ if (!op_is_write(op)) {
atomic64_inc(&zram->stats.num_reads);
ret = zram_bvec_read(zram, bvec, index, offset, bio);
flush_dcache_page(bvec->bv_page);
@@ -1306,14 +1305,14 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
ret = zram_bvec_write(zram, bvec, index, offset, bio);
}
- generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+ generic_end_io_acct(q, op, &zram->disk->part0, start_time);
zram_slot_lock(zram, index);
zram_accessed(zram, index);
zram_slot_unlock(zram, index);
if (unlikely(ret < 0)) {
- if (!is_write)
+ if (!op_is_write(op))
atomic64_inc(&zram->stats.failed_reads);
else
atomic64_inc(&zram->stats.failed_writes);
@@ -1351,7 +1350,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
unwritten);
if (zram_bvec_rw(zram, &bv, index, offset,
- op_is_write(bio_op(bio)), bio) < 0)
+ bio_op(bio), bio) < 0)
goto out;
bv.bv_offset += bv.bv_len;
@@ -1403,7 +1402,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
}
static int zram_rw_page(struct block_device *bdev, sector_t sector,
- struct page *page, bool is_write)
+ struct page *page, unsigned int op)
{
int offset, ret;
u32 index;
@@ -1427,7 +1426,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
- ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+ ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
out:
/*
* If I/O fails, just return error(ie, non-zero) without
@@ -1442,7 +1441,7 @@ out:
switch (ret) {
case 0:
- page_endio(page, is_write, 0);
+ page_endio(page, op_is_write(op), 0);
break;
case 1:
ret = 0;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index a78b8e7085e9..113fc6edb2b0 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -282,6 +282,7 @@
#include <linux/blkdev.h>
#include <linux/times.h>
#include <linux/uaccess.h>
+#include <scsi/scsi_common.h>
#include <scsi/scsi_request.h>
/* used to tell the module to turn on full debugging messages */
@@ -345,10 +346,10 @@ static LIST_HEAD(cdrom_list);
int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
struct packet_command *cgc)
{
- if (cgc->sense) {
- cgc->sense->sense_key = 0x05;
- cgc->sense->asc = 0x20;
- cgc->sense->ascq = 0x00;
+ if (cgc->sshdr) {
+ cgc->sshdr->sense_key = 0x05;
+ cgc->sshdr->asc = 0x20;
+ cgc->sshdr->ascq = 0x00;
}
cgc->stat = -EIO;
@@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
blk_execute_rq(q, cdi->disk, rq, 0);
if (scsi_req(rq)->result) {
- struct request_sense *s = req->sense;
+ struct scsi_sense_hdr sshdr;
+
ret = -EIO;
- cdi->last_sense = s->sense_key;
+ scsi_normalize_sense(req->sense, req->sense_len,
+ &sshdr);
+ cdi->last_sense = sshdr.sense_key;
}
if (blk_rq_unmap_user(bio))
@@ -2943,7 +2947,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
struct packet_command *cgc,
int cmd)
{
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
struct cdrom_msf msf;
int blocksize = 0, format = 0, lba;
int ret;
@@ -2971,13 +2975,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
if (cgc->buffer == NULL)
return -ENOMEM;
- memset(&sense, 0, sizeof(sense));
- cgc->sense = &sense;
+ memset(&sshdr, 0, sizeof(sshdr));
+ cgc->sshdr = &sshdr;
cgc->data_direction = CGC_DATA_READ;
ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
- if (ret && sense.sense_key == 0x05 &&
- sense.asc == 0x20 &&
- sense.ascq == 0x00) {
+ if (ret && sshdr.sense_key == 0x05 &&
+ sshdr.asc == 0x20 &&
+ sshdr.ascq == 0x00) {
/*
* SCSI-II devices are not required to support
* READ_CD, so let's try switching block size
@@ -2986,7 +2990,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
ret = cdrom_switch_blocksize(cdi, blocksize);
if (ret)
goto out;
- cgc->sense = NULL;
+ cgc->sshdr = NULL;
ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
ret |= cdrom_switch_blocksize(cdi, blocksize);
}
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5f178384876f..44a7a255ef74 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -419,10 +419,11 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
int write, void *buffer, unsigned *bufflen,
- struct request_sense *sense, int timeout,
+ struct scsi_sense_hdr *sshdr, int timeout,
req_flags_t rq_flags)
{
struct cdrom_info *info = drive->driver_data;
+ struct scsi_sense_hdr local_sshdr;
int retries = 10;
bool failed;
@@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
"rq_flags: 0x%x",
cmd[0], write, timeout, rq_flags);
+ if (!sshdr)
+ sshdr = &local_sshdr;
+
/* start of retry loop */
do {
struct request *rq;
@@ -456,8 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
if (buffer)
*bufflen = scsi_req(rq)->resid_len;
- if (sense)
- memcpy(sense, scsi_req(rq)->sense, sizeof(*sense));
+ scsi_normalize_sense(scsi_req(rq)->sense,
+ scsi_req(rq)->sense_len, sshdr);
/*
* FIXME: we should probably abort/retry or something in case of
@@ -469,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
* The request failed. Retry if it was due to a unit
* attention status (usually means media was changed).
*/
- struct request_sense *reqbuf = scsi_req(rq)->sense;
-
- if (reqbuf->sense_key == UNIT_ATTENTION)
+ if (sshdr->sense_key == UNIT_ATTENTION)
cdrom_saw_media_change(drive);
- else if (reqbuf->sense_key == NOT_READY &&
- reqbuf->asc == 4 && reqbuf->ascq != 4) {
+ else if (sshdr->sense_key == NOT_READY &&
+ sshdr->asc == 4 && sshdr->ascq != 4) {
/*
* The drive is in the process of loading
* a disk. Retry, but wait a little to give
@@ -864,7 +866,7 @@ static void msf_from_bcd(struct atapi_msf *msf)
msf->frame = bcd2bin(msf->frame);
}
-int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
+int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
{
struct cdrom_info *info = drive->driver_data;
struct cdrom_device_info *cdi;
@@ -886,12 +888,11 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
*/
cmd[7] = cdi->sanyo_slot % 3;
- return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET);
+ return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
}
static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
- unsigned long *sectors_per_frame,
- struct request_sense *sense)
+ unsigned long *sectors_per_frame)
{
struct {
__be32 lba;
@@ -908,7 +909,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
memset(cmd, 0, BLK_MAX_CDB);
cmd[0] = GPCMD_READ_CDVD_CAPACITY;
- stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0,
+ stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0,
RQF_QUIET);
if (stat)
return stat;
@@ -944,8 +945,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
}
static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
- int format, char *buf, int buflen,
- struct request_sense *sense)
+ int format, char *buf, int buflen)
{
unsigned char cmd[BLK_MAX_CDB];
@@ -962,11 +962,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
if (msf_flag)
cmd[1] = 2;
- return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET);
+ return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET);
}
/* Try to read the entire TOC for the disk into our internal buffer. */
-int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
+int ide_cd_read_toc(ide_drive_t *drive)
{
int stat, ntracks, i;
struct cdrom_info *info = drive->driver_data;
@@ -996,14 +996,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
* Check to see if the existing data is still valid. If it is,
* just return.
*/
- (void) cdrom_check_status(drive, sense);
+ (void) cdrom_check_status(drive, NULL);
if (drive->atapi_flags & IDE_AFLAG_TOC_VALID)
return 0;
/* try to get the total cdrom capacity and sector size */
- stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame,
- sense);
+ stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame);
if (stat)
toc->capacity = 0x1fffff;
@@ -1016,7 +1015,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
/* first read just the header, so we know how long the TOC is */
stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
- sizeof(struct atapi_toc_header), sense);
+ sizeof(struct atapi_toc_header));
if (stat)
return stat;
@@ -1036,7 +1035,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
(char *)&toc->hdr,
sizeof(struct atapi_toc_header) +
(ntracks + 1) *
- sizeof(struct atapi_toc_entry), sense);
+ sizeof(struct atapi_toc_entry));
if (stat && toc->hdr.first_track > 1) {
/*
@@ -1056,8 +1055,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
(char *)&toc->hdr,
sizeof(struct atapi_toc_header) +
(ntracks + 1) *
- sizeof(struct atapi_toc_entry),
- sense);
+ sizeof(struct atapi_toc_entry));
if (stat)
return stat;
@@ -1094,7 +1092,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
if (toc->hdr.first_track != CDROM_LEADOUT) {
/* read the multisession information */
stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp,
- sizeof(ms_tmp), sense);
+ sizeof(ms_tmp));
if (stat)
return stat;
@@ -1108,7 +1106,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
/* re-read multisession information using MSF format */
stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp,
- sizeof(ms_tmp), sense);
+ sizeof(ms_tmp));
if (stat)
return stat;
@@ -1412,7 +1410,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive)
{
unsigned long capacity, sectors_per_frame;
- if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame, NULL))
+ if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame))
return 0;
return capacity * sectors_per_frame;
@@ -1710,9 +1708,8 @@ static unsigned int idecd_check_events(struct gendisk *disk,
static int idecd_revalidate_disk(struct gendisk *disk)
{
struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
- struct request_sense sense;
- ide_cd_read_toc(info->drive, &sense);
+ ide_cd_read_toc(info->drive);
return 0;
}
@@ -1736,7 +1733,6 @@ static int ide_cd_probe(ide_drive_t *drive)
{
struct cdrom_info *info;
struct gendisk *g;
- struct request_sense sense;
ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x",
drive->driver_req, drive->media);
@@ -1785,7 +1781,7 @@ static int ide_cd_probe(ide_drive_t *drive)
goto failed;
}
- ide_cd_read_toc(drive, &sense);
+ ide_cd_read_toc(drive);
g->fops = &idecd_ops;
g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
device_add_disk(&drive->gendev, g);
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 04f0f310a856..a69dc7f61c4d 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
/* ide-cd.c functions used by ide-cd_ioctl.c */
int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
- unsigned *, struct request_sense *, int, req_flags_t);
-int ide_cd_read_toc(ide_drive_t *, struct request_sense *);
+ unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
+int ide_cd_read_toc(ide_drive_t *);
int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
void ide_cdrom_update_speed(ide_drive_t *, u8 *);
-int cdrom_check_status(ide_drive_t *, struct request_sense *);
+int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
/* ide-cd_ioctl.c */
int ide_cdrom_open_real(struct cdrom_device_info *, int);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index b1322400887b..4a6e1a413ead 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
{
ide_drive_t *drive = cdi->handle;
struct media_event_desc med;
- struct request_sense sense;
+ struct scsi_sense_hdr sshdr;
int stat;
if (slot_nr != CDSL_CURRENT)
return -EINVAL;
- stat = cdrom_check_status(drive, &sense);
- if (!stat || sense.sense_key == UNIT_ATTENTION)
+ stat = cdrom_check_status(drive, &sshdr);
+ if (!stat || sshdr.sense_key == UNIT_ATTENTION)
return CDS_DISC_OK;
if (!cdrom_get_media_event(cdi, &med)) {
@@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
return CDS_NO_DISC;
}
- if (sense.sense_key == NOT_READY && sense.asc == 0x04
- && sense.ascq == 0x04)
+ if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
+ && sshdr.ascq == 0x04)
return CDS_DISC_OK;
/*
@@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
* just return TRAY_OPEN since ATAPI doesn't provide
* any other way to detect this...
*/
- if (sense.sense_key == NOT_READY) {
- if (sense.asc == 0x3a && sense.ascq == 1)
+ if (sshdr.sense_key == NOT_READY) {
+ if (sshdr.asc == 0x3a && sshdr.ascq == 1)
return CDS_NO_DISC;
else
return CDS_TRAY_OPEN;
@@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
/* Eject the disk if EJECTFLAG is 0.
If EJECTFLAG is 1, try to reload the disk. */
static
-int cdrom_eject(ide_drive_t *drive, int ejectflag,
- struct request_sense *sense)
+int cdrom_eject(ide_drive_t *drive, int ejectflag)
{
struct cdrom_info *cd = drive->driver_data;
struct cdrom_device_info *cdi = &cd->devinfo;
@@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag,
cmd[0] = GPCMD_START_STOP_UNIT;
cmd[4] = loej | (ejectflag != 0);
- return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0);
+ return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
}
/* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */
static
-int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
- struct request_sense *sense)
+int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
{
- struct request_sense my_sense;
+ struct scsi_sense_hdr sshdr;
int stat;
- if (sense == NULL)
- sense = &my_sense;
-
/* If the drive cannot lock the door, just pretend. */
if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
stat = 0;
@@ -155,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
cmd[4] = lockflag ? 1 : 0;
stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
- sense, 0, 0);
+ &sshdr, 0, 0);
}
/* If we got an illegal field error, the drive
probably cannot lock the door. */
if (stat != 0 &&
- sense->sense_key == ILLEGAL_REQUEST &&
- (sense->asc == 0x24 || sense->asc == 0x20)) {
+ sshdr.sense_key == ILLEGAL_REQUEST &&
+ (sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
printk(KERN_ERR "%s: door locking not supported\n",
drive->name);
drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
@@ -170,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
}
/* no medium, that's alright. */
- if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a)
+ if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
stat = 0;
if (stat == 0) {
@@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position)
{
ide_drive_t *drive = cdi->handle;
- struct request_sense sense;
if (position) {
- int stat = ide_cd_lockdoor(drive, 0, &sense);
+ int stat = ide_cd_lockdoor(drive, 0);
if (stat)
return stat;
}
- return cdrom_eject(drive, !position, &sense);
+ return cdrom_eject(drive, !position);
}
int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock)
{
ide_drive_t *drive = cdi->handle;
- return ide_cd_lockdoor(drive, lock, NULL);
+ return ide_cd_lockdoor(drive, lock);
}
/*
@@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
{
ide_drive_t *drive = cdi->handle;
struct cdrom_info *cd = drive->driver_data;
- struct request_sense sense;
u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
int stat;
unsigned char cmd[BLK_MAX_CDB];
@@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
cmd[5] = speed & 0xff;
}
- stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
+ stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
if (!ide_cdrom_get_capabilities(drive, buf)) {
ide_cdrom_update_speed(drive, buf);
@@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi,
struct atapi_toc *toc;
ide_drive_t *drive = cdi->handle;
struct cdrom_info *info = drive->driver_data;
- struct request_sense sense;
int ret;
if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) {
- ret = ide_cd_read_toc(drive, &sense);
+ ret = ide_cd_read_toc(drive);
if (ret)
return ret;
}
@@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
{
ide_drive_t *drive = cdi->handle;
struct cdrom_info *cd = drive->driver_data;
- struct request_sense sense;
struct request *rq;
int ret;
@@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
* lock it again.
*/
if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED)
- (void)ide_cd_lockdoor(drive, 1, &sense);
+ (void)ide_cd_lockdoor(drive, 1);
return ret;
}
@@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
struct atapi_toc_entry *first_toc, *last_toc;
unsigned long lba_start, lba_end;
int stat;
- struct request_sense sense;
unsigned char cmd[BLK_MAX_CDB];
stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc);
@@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]);
lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]);
- return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
+ return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
}
static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
@@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
int stat;
/* Make sure our saved TOC is valid. */
- stat = ide_cd_read_toc(drive, NULL);
+ stat = ide_cd_read_toc(drive);
if (stat)
return stat;
@@ -461,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
layer. the packet must be complete, as we do not
touch it at all. */
- if (cgc->sense)
- memset(cgc->sense, 0, sizeof(struct request_sense));
+ if (cgc->sshdr)
+ memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
if (cgc->quiet)
flags |= RQF_QUIET;
@@ -470,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
cgc->data_direction == CGC_DATA_WRITE,
cgc->buffer, &len,
- cgc->sense, cgc->timeout, flags);
+ cgc->sshdr, cgc->timeout, flags);
if (!cgc->stat)
cgc->buflen -= len;
return cgc->stat;
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index ca844a926e6a..130bf163f066 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
{
domain->sig_type = IB_SIG_TYPE_T10_DIF;
domain->sig.dif.pi_interval = scsi_prot_interval(sc);
- domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
+ domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
/*
* At the moment we hard code those, but in the future
* we will take them from sc.
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 9c03f35d9df1..439bf90d084d 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -17,23 +17,25 @@ menuconfig NVM
if NVM
-config NVM_DEBUG
- bool "Open-Channel SSD debugging support"
- default n
- ---help---
- Exposes a debug management interface to create/remove targets at:
+config NVM_PBLK
+ tristate "Physical Block Device Open-Channel SSD target"
+ help
+ Allows an open-channel SSD to be exposed as a block device to the
+ host. The target assumes the device exposes raw flash and must be
+ explicitly managed by the host.
- /sys/module/lnvm/parameters/configure_debug
+ Please note the disk format is considered EXPERIMENTAL for now.
- It is required to create/remove targets without IOCTLs.
+if NVM_PBLK
-config NVM_PBLK
- tristate "Physical Block Device Open-Channel SSD target"
- ---help---
- Allows an open-channel SSD to be exposed as a block device to the
- host. The target assumes the device exposes raw flash and must be
- explicitly managed by the host.
+config NVM_PBLK_DEBUG
+ bool "PBlk Debug Support"
+ default n
+ help
+ Enables debug support for pblk. This includes extra checks, more
+ vocal error messages, and extra tracking fields in the pblk sysfs
+ entries.
- Please note the disk format is considered EXPERIMENTAL for now.
+endif # NVM_PBLK_DEBUG
endif # NVM
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index b1c6d7eb6115..f565a56b898a 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
int nr_entries = pblk_get_secs(bio);
int i, ret;
- generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
+ generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
+ &pblk->disk->part0);
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
@@ -67,7 +68,7 @@ retry:
atomic64_add(nr_entries, &pblk->user_wa);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(nr_entries, &pblk->inflight_writes);
atomic_long_add(nr_entries, &pblk->req_writes);
#endif
@@ -75,7 +76,7 @@ retry:
pblk_rl_inserted(&pblk->rl, nr_entries);
out:
- generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
+ generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
pblk_write_should_kick(pblk);
return ret;
}
@@ -123,7 +124,7 @@ retry:
atomic64_add(valid_entries, &pblk->gc_wa);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(valid_entries, &pblk->inflight_writes);
atomic_long_add(valid_entries, &pblk->recov_gc_writes);
#endif
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index ed9cc977c8b3..00984b486fea 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
line = &pblk->lines[pblk_ppa_to_line(*ppa)];
pos = pblk_ppa_to_pos(&dev->geo, *ppa);
- pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+ pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n",
line->id, pos);
}
@@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
struct ppa_addr *ppa;
int pos = pblk_ppa_to_pos(geo, ppa_addr);
- pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
+ pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos);
atomic_long_inc(&pblk->erase_failed);
atomic_dec(&line->blk_in_line);
if (test_and_set_bit(pos, line->blk_bitmap))
- pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+ pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n",
line->id, pos);
/* Not necessary to mark bad blocks on 2.0 spec. */
@@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
u64 paddr;
int line_id;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a device address */
BUG_ON(pblk_addr_in_cache(ppa));
BUG_ON(pblk_ppa_empty(ppa));
@@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
switch (type) {
case PBLK_WRITE:
kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
+ /* fall through */
case PBLK_WRITE_INT:
pool = &pblk->w_rq_pool;
break;
@@ -274,7 +275,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
pool = &pblk->e_rq_pool;
break;
default:
- pr_err("pblk: trying to free unknown rqd type\n");
+ pblk_err(pblk, "trying to free unknown rqd type\n");
return;
}
@@ -310,7 +311,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
if (ret != PBLK_EXPOSED_PAGE_SIZE) {
- pr_err("pblk: could not add page to bio\n");
+ pblk_err(pblk, "could not add page to bio\n");
mempool_free(page, &pblk->page_bio_pool);
goto err;
}
@@ -410,7 +411,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
line->state = PBLK_LINESTATE_CORRUPT;
line->gc_group = PBLK_LINEGC_NONE;
move_list = &l_mg->corrupt_list;
- pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+ pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
line->id, vsc,
line->sec_in_line,
lm->high_thrs, lm->mid_thrs);
@@ -430,7 +431,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
{
atomic_long_inc(&pblk->write_failed);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
pblk_print_failed_rqd(pblk, rqd, rqd->error);
#endif
}
@@ -452,9 +453,9 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
atomic_long_inc(&pblk->read_failed);
break;
default:
- pr_err("pblk: unknown read error:%d\n", rqd->error);
+ pblk_err(pblk, "unknown read error:%d\n", rqd->error);
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
pblk_print_failed_rqd(pblk, rqd, rqd->error);
#endif
}
@@ -470,7 +471,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
atomic_inc(&pblk->inflight_io);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
if (pblk_check_io(pblk, rqd))
return NVM_IO_ERR;
#endif
@@ -484,7 +485,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
atomic_inc(&pblk->inflight_io);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
if (pblk_check_io(pblk, rqd))
return NVM_IO_ERR;
#endif
@@ -517,7 +518,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
for (i = 0; i < nr_secs; i++) {
page = vmalloc_to_page(kaddr);
if (!page) {
- pr_err("pblk: could not map vmalloc bio\n");
+ pblk_err(pblk, "could not map vmalloc bio\n");
bio_put(bio);
bio = ERR_PTR(-ENOMEM);
goto out;
@@ -525,7 +526,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
if (ret != PAGE_SIZE) {
- pr_err("pblk: could not add page to bio\n");
+ pblk_err(pblk, "could not add page to bio\n");
bio_put(bio);
bio = ERR_PTR(-ENOMEM);
goto out;
@@ -711,7 +712,7 @@ next_rq:
while (test_bit(pos, line->blk_bitmap)) {
paddr += min;
if (pblk_boundary_paddr_checks(pblk, paddr)) {
- pr_err("pblk: corrupt emeta line:%d\n",
+ pblk_err(pblk, "corrupt emeta line:%d\n",
line->id);
bio_put(bio);
ret = -EINTR;
@@ -723,7 +724,7 @@ next_rq:
}
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
- pr_err("pblk: corrupt emeta line:%d\n",
+ pblk_err(pblk, "corrupt emeta line:%d\n",
line->id);
bio_put(bio);
ret = -EINTR;
@@ -738,7 +739,7 @@ next_rq:
ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
- pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
bio_put(bio);
goto free_rqd_dma;
}
@@ -843,7 +844,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
*/
ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
- pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
bio_put(bio);
goto free_ppa_list;
}
@@ -905,7 +906,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+ pblk_err(pblk, "could not sync erase line:%d,blk:%d\n",
pblk_ppa_to_line(ppa),
pblk_ppa_to_pos(geo, ppa));
@@ -945,7 +946,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
ret = pblk_blk_erase_sync(pblk, ppa);
if (ret) {
- pr_err("pblk: failed to erase line %d\n", line->id);
+ pblk_err(pblk, "failed to erase line %d\n", line->id);
return ret;
}
} while (1);
@@ -1012,7 +1013,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
list_add_tail(&line->list, &l_mg->bad_list);
spin_unlock(&l_mg->free_lock);
- pr_debug("pblk: line %d is bad\n", line->id);
+ pblk_debug(pblk, "line %d is bad\n", line->id);
return 0;
}
@@ -1122,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
line->cur_sec = off + lm->smeta_sec;
if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
- pr_debug("pblk: line smeta I/O failed. Retry\n");
+ pblk_debug(pblk, "line smeta I/O failed. Retry\n");
return 0;
}
@@ -1154,7 +1155,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
spin_unlock(&line->lock);
list_add_tail(&line->list, &l_mg->bad_list);
- pr_err("pblk: unexpected line %d is bad\n", line->id);
+ pblk_err(pblk, "unexpected line %d is bad\n", line->id);
return 0;
}
@@ -1299,7 +1300,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
retry:
if (list_empty(&l_mg->free_list)) {
- pr_err("pblk: no free lines\n");
+ pblk_err(pblk, "no free lines\n");
return NULL;
}
@@ -1315,7 +1316,7 @@ retry:
list_add_tail(&line->list, &l_mg->bad_list);
- pr_debug("pblk: line %d is bad\n", line->id);
+ pblk_debug(pblk, "line %d is bad\n", line->id);
goto retry;
}
@@ -1329,7 +1330,7 @@ retry:
list_add(&line->list, &l_mg->corrupt_list);
goto retry;
default:
- pr_err("pblk: failed to prepare line %d\n", line->id);
+ pblk_err(pblk, "failed to prepare line %d\n", line->id);
list_add(&line->list, &l_mg->free_list);
l_mg->nr_free_lines++;
return NULL;
@@ -1477,7 +1478,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
ret = pblk_submit_meta_io(pblk, line);
if (ret) {
- pr_err("pblk: sync meta line %d failed (%d)\n",
+ pblk_err(pblk, "sync meta line %d failed (%d)\n",
line->id, ret);
return;
}
@@ -1507,7 +1508,7 @@ void __pblk_pipeline_flush(struct pblk *pblk)
ret = pblk_recov_pad(pblk);
if (ret) {
- pr_err("pblk: could not close data on teardown(%d)\n", ret);
+ pblk_err(pblk, "could not close data on teardown(%d)\n", ret);
return;
}
@@ -1687,7 +1688,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- pr_err("pblk: could not async erase line:%d,blk:%d\n",
+ pblk_err(pblk, "could not async erase line:%d,blk:%d\n",
pblk_ppa_to_line(ppa),
pblk_ppa_to_pos(geo, ppa));
}
@@ -1726,7 +1727,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
struct list_head *move_list;
int i;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
"pblk: corrupt closed line %d\n", line->id);
#endif
@@ -1856,7 +1857,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
* Only send one inflight I/O per LUN. Since we map at a page
* granurality, all ppas in the I/O will map to the same LUN
*/
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
int i;
for (i = 1; i < nr_ppas; i++)
@@ -1866,7 +1867,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
if (ret == -ETIME || ret == -EINTR)
- pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret);
+ pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
+ -ret);
}
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
@@ -1901,7 +1903,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
struct pblk_lun *rlun;
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
int i;
for (i = 1; i < nr_ppas; i++)
@@ -1951,7 +1953,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
{
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(!pblk_addr_in_cache(ppa));
BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
@@ -1966,7 +1968,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
struct ppa_addr ppa_l2p, ppa_gc;
int ret = 1;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(!pblk_addr_in_cache(ppa_new));
BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
@@ -2003,14 +2005,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
{
struct ppa_addr ppa_l2p;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a device address */
BUG_ON(pblk_addr_in_cache(ppa_mapped));
#endif
/* Invalidate and discard padded entries */
if (lba == ADDR_EMPTY) {
atomic64_inc(&pblk->pad_wa);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->padded_wb);
#endif
if (!pblk_ppa_empty(ppa_mapped))
@@ -2036,7 +2038,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
goto out;
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
#endif
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 080469d90b40..157c2567c9e8 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
if (!gc_rq->data) {
- pr_err("pblk: could not GC line:%d (%d/%d)\n",
+ pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
line->id, *line->vsc, gc_rq->nr_secs);
goto out;
}
@@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
/* Read from GC victim block */
ret = pblk_submit_read_gc(pblk, gc_rq);
if (ret) {
- pr_err("pblk: failed GC read in line:%d (err:%d)\n",
+ pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
line->id, ret);
goto out;
}
@@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
ret = pblk_line_read_emeta(pblk, line, emeta_buf);
if (ret) {
- pr_err("pblk: line %d read emeta failed (%d)\n",
+ pblk_err(pblk, "line %d read emeta failed (%d)\n",
line->id, ret);
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
return NULL;
@@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
ret = pblk_recov_check_emeta(pblk, emeta_buf);
if (ret) {
- pr_err("pblk: inconsistent emeta (line %d)\n",
+ pblk_err(pblk, "inconsistent emeta (line %d)\n",
line->id);
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
return NULL;
@@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
} else {
lba_list = get_lba_list_from_emeta(pblk, line);
if (!lba_list) {
- pr_err("pblk: could not interpret emeta (line %d)\n",
+ pblk_err(pblk, "could not interpret emeta (line %d)\n",
line->id);
goto fail_free_invalid_bitmap;
}
@@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
spin_unlock(&line->lock);
if (sec_left < 0) {
- pr_err("pblk: corrupted GC line (%d)\n", line->id);
+ pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
goto fail_free_lba_list;
}
@@ -289,7 +289,7 @@ fail_free_ws:
kref_put(&line->ref, pblk_line_put);
atomic_dec(&gc->read_inflight_gc);
- pr_err("pblk: Failed to GC line %d\n", line->id);
+ pblk_err(pblk, "failed to GC line %d\n", line->id);
}
static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
@@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
struct pblk_gc *gc = &pblk->gc;
struct pblk_line_ws *line_ws;
- pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+ pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
if (!line_ws)
@@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk)
pblk_gc_kick(pblk);
if (pblk_gc_line(pblk, line))
- pr_err("pblk: failed to GC line %d\n", line->id);
+ pblk_err(pblk, "failed to GC line %d\n", line->id);
return 0;
}
@@ -522,8 +522,8 @@ static int pblk_gc_reader_ts(void *data)
io_schedule();
}
-#ifdef CONFIG_NVM_DEBUG
- pr_info("pblk: flushing gc pipeline, %d lines left\n",
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
atomic_read(&gc->pipeline_gc));
#endif
@@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data)
static void pblk_gc_start(struct pblk *pblk)
{
pblk->gc.gc_active = 1;
- pr_debug("pblk: gc start\n");
+ pblk_debug(pblk, "gc start\n");
}
void pblk_gc_should_start(struct pblk *pblk)
@@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk)
gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
if (IS_ERR(gc->gc_ts)) {
- pr_err("pblk: could not allocate GC main kthread\n");
+ pblk_err(pblk, "could not allocate GC main kthread\n");
return PTR_ERR(gc->gc_ts);
}
gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
"pblk-gc-writer-ts");
if (IS_ERR(gc->gc_writer_ts)) {
- pr_err("pblk: could not allocate GC writer kthread\n");
+ pblk_err(pblk, "could not allocate GC writer kthread\n");
ret = PTR_ERR(gc->gc_writer_ts);
goto fail_free_main_kthread;
}
@@ -620,7 +620,7 @@ int pblk_gc_init(struct pblk *pblk)
gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
"pblk-gc-reader-ts");
if (IS_ERR(gc->gc_reader_ts)) {
- pr_err("pblk: could not allocate GC reader kthread\n");
+ pblk_err(pblk, "could not allocate GC reader kthread\n");
ret = PTR_ERR(gc->gc_reader_ts);
goto fail_free_writer_kthread;
}
@@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk)
gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
if (!gc->gc_line_reader_wq) {
- pr_err("pblk: could not allocate GC line reader workqueue\n");
+ pblk_err(pblk, "could not allocate GC line reader workqueue\n");
ret = -ENOMEM;
goto fail_free_reader_kthread;
}
@@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk)
gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!gc->gc_reader_wq) {
- pr_err("pblk: could not allocate GC reader workqueue\n");
+ pblk_err(pblk, "could not allocate GC reader workqueue\n");
ret = -ENOMEM;
goto fail_free_reader_line_wq;
}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index b57f764d6a16..537e98f2b24a 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk)
return entry_size * pblk->rl.nr_secs;
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
size_t map_size;
@@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
} else {
line = pblk_recov_l2p(pblk);
if (IS_ERR(line)) {
- pr_err("pblk: could not recover l2p table\n");
+ pblk_err(pblk, "could not recover l2p table\n");
return -EFAULT;
}
}
-#ifdef CONFIG_NVM_DEBUG
- pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif
/* Free full lines directly as GC has not been started yet */
@@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
static void pblk_rwb_free(struct pblk *pblk)
{
if (pblk_rb_tear_down_check(&pblk->rwb))
- pr_err("pblk: write buffer error on tear down\n");
+ pblk_err(pblk, "write buffer error on tear down\n");
pblk_rb_data_free(&pblk->rwb);
vfree(pblk_rb_entries_ref(&pblk->rwb));
@@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk)
struct pblk_rb_entry *entries;
unsigned long nr_entries, buffer_size;
unsigned int power_size, power_seg_sz;
+ int pgs_in_buffer;
- if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
+ pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;
+
+ if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
buffer_size = write_buffer_size;
else
- buffer_size = pblk->pgs_in_buffer;
+ buffer_size = pgs_in_buffer;
nr_entries = pblk_rb_calculate_size(buffer_size);
@@ -200,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk)
/* Minimum pages needed within a lun */
#define ADDR_POOL_SIZE 64
-static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
+static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
+ struct nvm_addrf_12 *dst)
{
struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
int power_len;
@@ -208,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
/* Re-calculate channel and lun format to adapt to configuration */
power_len = get_count_order(geo->num_ch);
if (1 << power_len != geo->num_ch) {
- pr_err("pblk: supports only power-of-two channel config.\n");
+ pblk_err(pblk, "supports only power-of-two channel config.\n");
return -EINVAL;
}
dst->ch_len = power_len;
power_len = get_count_order(geo->num_lun);
if (1 << power_len != geo->num_lun) {
- pr_err("pblk: supports only power-of-two LUN config.\n");
+ pblk_err(pblk, "supports only power-of-two LUN config.\n");
return -EINVAL;
}
dst->lun_len = power_len;
@@ -282,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk)
case NVM_OCSSD_SPEC_12:
div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
if (mod) {
- pr_err("pblk: bad configuration of sectors/pages\n");
+ pblk_err(pblk, "bad configuration of sectors/pages\n");
return -EINVAL;
}
- pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf);
+ pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
+ (void *)&pblk->addrf);
break;
case NVM_OCSSD_SPEC_20:
pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
- &pblk->uaddrf);
+ &pblk->uaddrf);
break;
default:
- pr_err("pblk: OCSSD revision not supported (%d)\n",
+ pblk_err(pblk, "OCSSD revision not supported (%d)\n",
geo->version);
return -EINVAL;
}
@@ -366,15 +371,13 @@ static int pblk_core_init(struct pblk *pblk)
atomic64_set(&pblk->nr_flush, 0);
pblk->nr_flush_rst = 0;
- pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns;
-
pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
max_write_ppas = pblk->min_write_pgs * geo->all_luns;
pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
- pr_err("pblk: vector list too big(%u > %u)\n",
+ pblk_err(pblk, "vector list too big(%u > %u)\n",
pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
return -EINVAL;
}
@@ -607,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk)
/* TODO: Implement unbalanced LUN support */
if (geo->num_lun < 0) {
- pr_err("pblk: unbalanced LUN config.\n");
+ pblk_err(pblk, "unbalanced LUN config.\n");
return -EINVAL;
}
@@ -716,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
/*
* In 1.2 spec. chunk state is not persisted by the device. Thus
- * some of the values are reset each time pblk is instantiated.
+ * some of the values are reset each time pblk is instantiated,
+ * so we have to assume that the block is closed.
*/
if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
- chunk->state = NVM_CHK_ST_FREE;
+ chunk->state = NVM_CHK_ST_CLOSED;
else
chunk->state = NVM_CHK_ST_OFFLINE;
@@ -1026,7 +1030,7 @@ add_emeta_page:
lm->emeta_sec[0], geo->clba);
if (lm->min_blk_line > lm->blk_per_line) {
- pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
+ pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
lm->blk_per_line);
return -EINVAL;
}
@@ -1078,7 +1082,7 @@ static int pblk_lines_init(struct pblk *pblk)
}
if (!nr_free_chks) {
- pr_err("pblk: too many bad blocks prevent for sane instance\n");
+ pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
return -EINTR;
}
@@ -1108,7 +1112,7 @@ static int pblk_writer_init(struct pblk *pblk)
int err = PTR_ERR(pblk->writer_ts);
if (err != -EINTR)
- pr_err("pblk: could not allocate writer kthread (%d)\n",
+ pblk_err(pblk, "could not allocate writer kthread (%d)\n",
err);
return err;
}
@@ -1154,7 +1158,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
pblk_rb_sync_l2p(&pblk->rwb);
pblk_rl_free(&pblk->rl);
- pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
+ pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
@@ -1165,8 +1169,8 @@ static void pblk_exit(void *private, bool graceful)
pblk_gc_exit(pblk, graceful);
pblk_tear_down(pblk, graceful);
-#ifdef CONFIG_NVM_DEBUG
- pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif
pblk_free(pblk);
@@ -1189,34 +1193,35 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
struct pblk *pblk;
int ret;
- /* pblk supports 1.2 and 2.0 versions */
+ pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+ if (!pblk)
+ return ERR_PTR(-ENOMEM);
+
+ pblk->dev = dev;
+ pblk->disk = tdisk;
+ pblk->state = PBLK_STATE_RUNNING;
+ pblk->gc.gc_enabled = 0;
+
if (!(geo->version == NVM_OCSSD_SPEC_12 ||
geo->version == NVM_OCSSD_SPEC_20)) {
- pr_err("pblk: OCSSD version not supported (%u)\n",
+ pblk_err(pblk, "OCSSD version not supported (%u)\n",
geo->version);
+ kfree(pblk);
return ERR_PTR(-EINVAL);
}
if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
- pr_err("pblk: host-side L2P table not supported. (%x)\n",
+ pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
geo->dom);
+ kfree(pblk);
return ERR_PTR(-EINVAL);
}
- pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
- if (!pblk)
- return ERR_PTR(-ENOMEM);
-
- pblk->dev = dev;
- pblk->disk = tdisk;
- pblk->state = PBLK_STATE_RUNNING;
- pblk->gc.gc_enabled = 0;
-
spin_lock_init(&pblk->resubmit_lock);
spin_lock_init(&pblk->trans_lock);
spin_lock_init(&pblk->lock);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_set(&pblk->inflight_writes, 0);
atomic_long_set(&pblk->padded_writes, 0);
atomic_long_set(&pblk->padded_wb, 0);
@@ -1241,38 +1246,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
ret = pblk_core_init(pblk);
if (ret) {
- pr_err("pblk: could not initialize core\n");
+ pblk_err(pblk, "could not initialize core\n");
goto fail;
}
ret = pblk_lines_init(pblk);
if (ret) {
- pr_err("pblk: could not initialize lines\n");
+ pblk_err(pblk, "could not initialize lines\n");
goto fail_free_core;
}
ret = pblk_rwb_init(pblk);
if (ret) {
- pr_err("pblk: could not initialize write buffer\n");
+ pblk_err(pblk, "could not initialize write buffer\n");
goto fail_free_lines;
}
ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
if (ret) {
- pr_err("pblk: could not initialize maps\n");
+ pblk_err(pblk, "could not initialize maps\n");
goto fail_free_rwb;
}
ret = pblk_writer_init(pblk);
if (ret) {
if (ret != -EINTR)
- pr_err("pblk: could not initialize write thread\n");
+ pblk_err(pblk, "could not initialize write thread\n");
goto fail_free_l2p;
}
ret = pblk_gc_init(pblk);
if (ret) {
- pr_err("pblk: could not initialize gc\n");
+ pblk_err(pblk, "could not initialize gc\n");
goto fail_stop_writer;
}
@@ -1287,8 +1292,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
- pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
- tdisk->disk_name,
+ pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
geo->all_luns, pblk->l_mg.nr_lines,
(unsigned long long)pblk->rl.nr_secs,
pblk->rwb.nr_entries);
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 55e9442a99e2..f6eec0212dfc 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
} while (iter > 0);
up_write(&pblk_rb_lock);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_set(&rb->inflight_flush_point, 0);
#endif
@@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
entry = &rb->entries[ring_pos];
flags = READ_ONCE(entry->w_ctx.flags);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must guarantee that the entry is free */
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif
@@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
entry = &rb->entries[ring_pos];
flags = READ_ONCE(entry->w_ctx.flags);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must guarantee that the entry is free */
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif
@@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
return 0;
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_inc(&rb->inflight_flush_point);
#endif
@@ -547,7 +547,7 @@ try:
page = virt_to_page(entry->data);
if (!page) {
- pr_err("pblk: could not allocate write bio page\n");
+ pblk_err(pblk, "could not allocate write bio page\n");
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
/* Release flags on context. Protect from writes */
@@ -557,7 +557,7 @@ try:
if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
rb->seg_size) {
- pr_err("pblk: could not add page to write bio\n");
+ pblk_err(pblk, "could not add page to write bio\n");
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
/* Release flags on context. Protect from writes */
@@ -576,19 +576,19 @@ try:
if (pad) {
if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
- pr_err("pblk: could not pad page in write bio\n");
+ pblk_err(pblk, "could not pad page in write bio\n");
return NVM_IO_ERR;
}
if (pad < pblk->min_write_pgs)
atomic64_inc(&pblk->pad_dist[pad - 1]);
else
- pr_warn("pblk: padding more than min. sectors\n");
+ pblk_warn(pblk, "padding more than min. sectors\n");
atomic64_add(pad, &pblk->pad_wa);
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(pad, &pblk->padded_writes);
#endif
@@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
int ret = 1;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must ensure that the access will not cause an overflow */
BUG_ON(pos >= rb->nr_entries);
#endif
@@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->subm,
rb->sync,
rb->l2p_update,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_read(&rb->inflight_flush_point),
#else
0,
@@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->subm,
rb->sync,
rb->l2p_update,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_read(&rb->inflight_flush_point),
#else
0,
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 18694694e5f0..5a46d7f9302f 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
sector_t lba, struct ppa_addr ppa,
int bio_iter, bool advanced_bio)
{
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(pblk_ppa_empty(ppa));
BUG_ON(!pblk_addr_in_cache(ppa));
@@ -79,7 +79,7 @@ retry:
WARN_ON(test_and_set_bit(i, read_bitmap));
meta_list[i].lba = cpu_to_le64(lba);
advanced_bio = true;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
#endif
} else {
@@ -97,7 +97,7 @@ next:
else
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(nr_secs, &pblk->inflight_reads);
#endif
}
@@ -117,13 +117,13 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
continue;
if (lba != blba + i) {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
struct ppa_addr *p;
p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
- print_ppa(&pblk->dev->geo, p, "seq", i);
+ print_ppa(pblk, p, "seq", i);
#endif
- pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+ pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
lba, (u64)blba + i);
WARN_ON(1);
}
@@ -149,14 +149,14 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
meta_lba = le64_to_cpu(meta_lba_list[j].lba);
if (lba != meta_lba) {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
struct ppa_addr *p;
int nr_ppas = rqd->nr_ppas;
p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
- print_ppa(&pblk->dev->geo, p, "seq", j);
+ print_ppa(pblk, p, "seq", j);
#endif
- pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+ pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
lba, meta_lba);
WARN_ON(1);
}
@@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
static void pblk_end_user_read(struct bio *bio)
{
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
#endif
bio_endio(bio);
@@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
struct bio *int_bio = rqd->bio;
unsigned long start_time = r_ctx->start_time;
- generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
+ generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
if (rqd->error)
pblk_log_read_err(pblk, rqd);
@@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
if (put_line)
pblk_read_put_rqd_kref(pblk, rqd);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
#endif
@@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
__pblk_end_io_read(pblk, rqd, true);
}
-static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
- struct bio *orig_bio, unsigned int bio_init_idx,
- unsigned long *read_bitmap)
+static void pblk_end_partial_read(struct nvm_rq *rqd)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
- struct bio *new_bio;
+ struct pblk *pblk = rqd->private;
+ struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_pr_ctx *pr_ctx = r_ctx->private;
+ struct bio *new_bio = rqd->bio;
+ struct bio *bio = pr_ctx->orig_bio;
struct bio_vec src_bv, dst_bv;
- void *ppa_ptr = NULL;
- void *src_p, *dst_p;
- dma_addr_t dma_ppa_list = 0;
- __le64 *lba_list_mem, *lba_list_media;
- int nr_secs = rqd->nr_ppas;
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ int bio_init_idx = pr_ctx->bio_init_idx;
+ unsigned long *read_bitmap = pr_ctx->bitmap;
+ int nr_secs = pr_ctx->orig_nr_secs;
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
- int i, ret, hole;
-
- /* Re-use allocated memory for intermediate lbas */
- lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
- lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
-
- new_bio = bio_alloc(GFP_KERNEL, nr_holes);
-
- if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
- goto fail_add_pages;
-
- if (nr_holes != new_bio->bi_vcnt) {
- pr_err("pblk: malformed bio\n");
- goto fail;
- }
-
- for (i = 0; i < nr_secs; i++)
- lba_list_mem[i] = meta_list[i].lba;
-
- new_bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
-
- rqd->bio = new_bio;
- rqd->nr_ppas = nr_holes;
- rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
-
- if (unlikely(nr_holes == 1)) {
- ppa_ptr = rqd->ppa_list;
- dma_ppa_list = rqd->dma_ppa_list;
- rqd->ppa_addr = rqd->ppa_list[0];
- }
-
- ret = pblk_submit_io_sync(pblk, rqd);
- if (ret) {
- bio_put(rqd->bio);
- pr_err("pblk: sync read IO submission failed\n");
- goto fail;
- }
-
- if (rqd->error) {
- atomic_long_inc(&pblk->read_failed);
-#ifdef CONFIG_NVM_DEBUG
- pblk_print_failed_rqd(pblk, rqd, rqd->error);
-#endif
- }
+ __le64 *lba_list_mem, *lba_list_media;
+ void *src_p, *dst_p;
+ int hole, i;
if (unlikely(nr_holes == 1)) {
struct ppa_addr ppa;
ppa = rqd->ppa_addr;
- rqd->ppa_list = ppa_ptr;
- rqd->dma_ppa_list = dma_ppa_list;
+ rqd->ppa_list = pr_ctx->ppa_ptr;
+ rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
rqd->ppa_list[0] = ppa;
}
+ /* Re-use allocated memory for intermediate lbas */
+ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+ lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
+
for (i = 0; i < nr_secs; i++) {
lba_list_media[i] = meta_list[i].lba;
meta_list[i].lba = lba_list_mem[i];
@@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
meta_list[hole].lba = lba_list_media[i];
src_bv = new_bio->bi_io_vec[i++];
- dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
+ dst_bv = bio->bi_io_vec[bio_init_idx + hole];
src_p = kmap_atomic(src_bv.bv_page);
dst_p = kmap_atomic(dst_bv.bv_page);
@@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
} while (hole < nr_secs);
bio_put(new_bio);
+ kfree(pr_ctx);
/* restore original request */
rqd->bio = NULL;
rqd->nr_ppas = nr_secs;
+ bio_endio(bio);
__pblk_end_io_read(pblk, rqd, false);
- return NVM_IO_DONE;
+}
-fail:
- /* Free allocated pages in new bio */
+static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap,
+ int nr_holes)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_pr_ctx *pr_ctx;
+ struct bio *new_bio, *bio = r_ctx->private;
+ __le64 *lba_list_mem;
+ int nr_secs = rqd->nr_ppas;
+ int i;
+
+ /* Re-use allocated memory for intermediate lbas */
+ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+
+ new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+
+ if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+ goto fail_bio_put;
+
+ if (nr_holes != new_bio->bi_vcnt) {
+ WARN_ONCE(1, "pblk: malformed bio\n");
+ goto fail_free_pages;
+ }
+
+ pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
+ if (!pr_ctx)
+ goto fail_free_pages;
+
+ for (i = 0; i < nr_secs; i++)
+ lba_list_mem[i] = meta_list[i].lba;
+
+ new_bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+
+ rqd->bio = new_bio;
+ rqd->nr_ppas = nr_holes;
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+ pr_ctx->ppa_ptr = NULL;
+ pr_ctx->orig_bio = bio;
+ bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
+ pr_ctx->bio_init_idx = bio_init_idx;
+ pr_ctx->orig_nr_secs = nr_secs;
+ r_ctx->private = pr_ctx;
+
+ if (unlikely(nr_holes == 1)) {
+ pr_ctx->ppa_ptr = rqd->ppa_list;
+ pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
+ rqd->ppa_addr = rqd->ppa_list[0];
+ }
+ return 0;
+
+fail_free_pages:
pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
-fail_add_pages:
- pr_err("pblk: failed to perform partial read\n");
+fail_bio_put:
+ bio_put(new_bio);
+
+ return -ENOMEM;
+}
+
+static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap, int nr_secs)
+{
+ int nr_holes;
+ int ret;
+
+ nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+
+ if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
+ nr_holes))
+ return NVM_IO_ERR;
+
+ rqd->end_io = pblk_end_partial_read;
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ bio_put(rqd->bio);
+ pblk_err(pblk, "partial read IO submission failed\n");
+ goto err;
+ }
+
+ return NVM_IO_OK;
+
+err:
+ pblk_err(pblk, "failed to perform partial read\n");
+
+ /* Free allocated pages in new bio */
+ pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
__pblk_end_io_read(pblk, rqd, false);
return NVM_IO_ERR;
}
@@ -359,7 +409,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->inflight_reads);
#endif
@@ -382,7 +432,7 @@ retry:
WARN_ON(test_and_set_bit(0, read_bitmap));
meta_list[0].lba = cpu_to_le64(lba);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
#endif
} else {
@@ -401,7 +451,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
struct pblk_g_ctx *r_ctx;
struct nvm_rq *rqd;
unsigned int bio_init_idx;
- unsigned long read_bitmap; /* Max 64 ppas per request */
+ DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
int ret = NVM_IO_ERR;
/* logic error: lba out-of-bounds. Ignore read request */
@@ -411,9 +461,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
return NVM_IO_ERR;
}
- generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
+ generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
+ &pblk->disk->part0);
- bitmap_zero(&read_bitmap, nr_secs);
+ bitmap_zero(read_bitmap, nr_secs);
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@@ -436,7 +487,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
&rqd->dma_meta_list);
if (!rqd->meta_list) {
- pr_err("pblk: not able to allocate ppa list\n");
+ pblk_err(pblk, "not able to allocate ppa list\n");
goto fail_rqd_free;
}
@@ -444,32 +495,32 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
- pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
+ pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
} else {
- pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
+ pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
}
- if (bitmap_full(&read_bitmap, nr_secs)) {
+ if (bitmap_full(read_bitmap, nr_secs)) {
atomic_inc(&pblk->inflight_io);
__pblk_end_io_read(pblk, rqd, false);
return NVM_IO_DONE;
}
/* All sectors are to be read from the device */
- if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
+ if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
struct bio *int_bio = NULL;
/* Clone read bio to deal with read errors internally */
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
if (!int_bio) {
- pr_err("pblk: could not clone read bio\n");
+ pblk_err(pblk, "could not clone read bio\n");
goto fail_end_io;
}
rqd->bio = int_bio;
if (pblk_submit_io(pblk, rqd)) {
- pr_err("pblk: read IO submission failed\n");
+ pblk_err(pblk, "read IO submission failed\n");
ret = NVM_IO_ERR;
goto fail_end_io;
}
@@ -480,8 +531,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
/* The read bio request could be partially filled by the write buffer,
* but there are some holes that need to be read from the drive.
*/
- return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
+ ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
+ nr_secs);
+ if (ret)
+ goto fail_meta_free;
+
+ return NVM_IO_OK;
+fail_meta_free:
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
fail_rqd_free:
pblk_free_rqd(pblk, rqd, PBLK_READ);
return ret;
@@ -514,7 +572,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(valid_secs, &pblk->inflight_reads);
#endif
@@ -548,7 +606,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
rqd->ppa_addr = ppa_l2p;
valid_secs = 1;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->inflight_reads);
#endif
@@ -595,7 +653,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
- pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
+ pblk_err(pblk, "could not allocate GC bio (%lu)\n",
+ PTR_ERR(bio));
goto err_free_dma;
}
@@ -609,7 +668,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
if (pblk_submit_io_sync(pblk, &rqd)) {
ret = -EIO;
- pr_err("pblk: GC read request failed\n");
+ pblk_err(pblk, "GC read request failed\n");
goto err_free_bio;
}
@@ -619,12 +678,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
if (rqd.error) {
atomic_long_inc(&pblk->read_failed_gc);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
pblk_print_failed_rqd(pblk, &rqd, rqd.error);
#endif
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 3a5069183859..e232e47e1353 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -77,7 +77,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
}
if (nr_valid_lbas != nr_lbas)
- pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n",
+ pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
line->id, nr_valid_lbas, nr_lbas);
line->left_msecs = 0;
@@ -184,7 +184,7 @@ next_read_rq:
/* If read fails, more padding is needed */
ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
- pr_err("pblk: I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
return ret;
}
@@ -194,7 +194,7 @@ next_read_rq:
* we cannot recover from here. Need FTL log.
*/
if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
- pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+ pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error);
return -EINTR;
}
@@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
next_pad_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
if (rq_ppas < pblk->min_write_pgs) {
- pr_err("pblk: corrupted pad line %d\n", line->id);
+ pblk_err(pblk, "corrupted pad line %d\n", line->id);
goto fail_free_pad;
}
@@ -342,7 +342,7 @@ next_pad_rq:
ret = pblk_submit_io(pblk, rqd);
if (ret) {
- pr_err("pblk: I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
goto fail_free_bio;
}
@@ -356,12 +356,12 @@ next_pad_rq:
if (!wait_for_completion_io_timeout(&pad_rq->wait,
msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: pad write timed out\n");
+ pblk_err(pblk, "pad write timed out\n");
ret = -ETIME;
}
if (!pblk_line_is_full(line))
- pr_err("pblk: corrupted padded line: %d\n", line->id);
+ pblk_err(pblk, "corrupted padded line: %d\n", line->id);
vfree(data);
free_rq:
@@ -461,7 +461,7 @@ next_rq:
ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
- pr_err("pblk: I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
return ret;
}
@@ -501,11 +501,11 @@ next_rq:
ret = pblk_recov_pad_oob(pblk, line, pad_secs);
if (ret)
- pr_err("pblk: OOB padding failed (err:%d)\n", ret);
+ pblk_err(pblk, "OOB padding failed (err:%d)\n", ret);
ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
if (ret)
- pr_err("pblk: OOB read failed (err:%d)\n", ret);
+ pblk_err(pblk, "OOB read failed (err:%d)\n", ret);
left_ppas = 0;
}
@@ -592,7 +592,7 @@ next_rq:
ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
- pr_err("pblk: I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
bio_put(bio);
return ret;
}
@@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
ret = pblk_recov_scan_oob(pblk, line, p, &done);
if (ret) {
- pr_err("pblk: could not recover L2P from OOB\n");
+ pblk_err(pblk, "could not recover L2P from OOB\n");
goto out;
}
if (!done) {
ret = pblk_recov_scan_all_oob(pblk, line, p);
if (ret) {
- pr_err("pblk: could not recover L2P from OOB\n");
+ pblk_err(pblk, "could not recover L2P from OOB\n");
goto out;
}
}
@@ -737,14 +737,15 @@ static int pblk_recov_check_line_version(struct pblk *pblk,
struct line_header *header = &emeta->header;
if (header->version_major != EMETA_VERSION_MAJOR) {
- pr_err("pblk: line major version mismatch: %d, expected: %d\n",
- header->version_major, EMETA_VERSION_MAJOR);
+ pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
+ header->version_major, EMETA_VERSION_MAJOR);
return 1;
}
-#ifdef NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
if (header->version_minor > EMETA_VERSION_MINOR)
- pr_info("pblk: newer line minor version found: %d\n", line_v);
+ pblk_info(pblk, "newer line minor version found: %d\n",
+ header->version_minor);
#endif
return 0;
@@ -851,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
continue;
if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
- pr_err("pblk: found incompatible line version %u\n",
+ pblk_err(pblk, "found incompatible line version %u\n",
smeta_buf->header.version_major);
return ERR_PTR(-EINVAL);
}
@@ -863,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
}
if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
- pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+ pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
i);
continue;
}
@@ -887,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
pblk_recov_line_add_ordered(&recov_list, line);
found_lines++;
- pr_debug("pblk: recovering data line %d, seq:%llu\n",
+ pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
line->id, smeta_buf->seq_nr);
}
@@ -947,7 +948,7 @@ next:
line->emeta = NULL;
} else {
if (open_lines > 1)
- pr_err("pblk: failed to recover L2P\n");
+ pblk_err(pblk, "failed to recover L2P\n");
open_lines++;
line->meta_line = meta_line;
@@ -976,7 +977,7 @@ next:
out:
if (found_lines != recovered_lines)
- pr_err("pblk: failed to recover all found lines %d/%d\n",
+ pblk_err(pblk, "failed to recover all found lines %d/%d\n",
found_lines, recovered_lines);
return data_line;
@@ -999,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk)
ret = pblk_recov_pad_oob(pblk, line, left_msecs);
if (ret) {
- pr_err("pblk: Tear down padding failed (%d)\n", ret);
+ pblk_err(pblk, "tear down padding failed (%d)\n", ret);
return ret;
}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 88a0a7c407aa..9fc3dfa168b4 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
spin_unlock(&l_mg->free_lock);
if (nr_free_lines != free_line_cnt)
- pr_err("pblk: corrupted free line list:%d/%d\n",
+ pblk_err(pblk, "corrupted free line list:%d/%d\n",
nr_free_lines, free_line_cnt);
sz = snprintf(page, PAGE_SIZE - sz,
@@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
return sz;
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
{
return snprintf(page, PAGE_SIZE,
@@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = {
.mode = 0644,
};
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
static struct attribute sys_stats_debug_attr = {
.name = "stats",
.mode = 0444,
@@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = {
&sys_write_amp_mileage,
&sys_write_amp_trip,
&sys_padding_dist,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
&sys_stats_debug_attr,
#endif
NULL,
@@ -654,7 +654,7 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
return pblk_sysfs_get_write_amp_trip(pblk, buf);
else if (strcmp(attr->name, "padding_dist") == 0)
return pblk_sysfs_get_padding_dist(pblk, buf);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
else if (strcmp(attr->name, "stats") == 0)
return pblk_sysfs_stats_debug(pblk, buf);
#endif
@@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk)
kobject_get(&parent_dev->kobj),
"%s", "pblk");
if (ret) {
- pr_err("pblk: could not register %s/pblk\n",
- tdisk->disk_name);
+ pblk_err(pblk, "could not register\n");
return ret;
}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index f353e52941f5..ee774a86cf1e 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
/* Release flags on context. Protect from writes */
smp_store_release(&w_ctx->flags, flags);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_dec(&rwb->inflight_flush_point);
#endif
}
@@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
c_ctx->nr_padded);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
#endif
@@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
unsigned long flags;
unsigned long pos;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif
@@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
list_add_tail(&r_ctx->list, &pblk->resubmit_list);
spin_unlock(&pblk->resubmit_lock);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
#endif
}
@@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
if (!recovery) {
- pr_err("pblk: could not allocate recovery work\n");
+ pblk_err(pblk, "could not allocate recovery work\n");
return;
}
@@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
pblk_end_w_fail(pblk, rqd);
return;
}
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
else
WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
#endif
@@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
if (rqd->error) {
pblk_log_write_err(pblk, rqd);
- pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
+ pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
line->w_err_gc->has_write_err = 1;
}
@@ -356,11 +356,11 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
if ((!secs_to_sync && secs_to_flush)
|| (secs_to_sync < 0)
|| (secs_to_sync > secs_avail && !secs_to_flush)) {
- pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+ pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
secs_avail, secs_to_sync, secs_to_flush);
}
#endif
@@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
l_mg->emeta_alloc_type, GFP_KERNEL);
if (IS_ERR(bio)) {
- pr_err("pblk: failed to map emeta io");
+ pblk_err(pblk, "failed to map emeta io");
ret = PTR_ERR(bio);
goto fail_free_rqd;
}
@@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
ret = pblk_submit_io(pblk, rqd);
if (ret) {
- pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+ pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
goto fail_rollback;
}
@@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
if (err) {
- pr_err("pblk: could not setup write request: %d\n", err);
+ pblk_err(pblk, "could not setup write request: %d\n", err);
return NVM_IO_ERR;
}
@@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
/* Submit data write for current data line */
err = pblk_submit_io(pblk, rqd);
if (err) {
- pr_err("pblk: data I/O submission failed: %d\n", err);
+ pblk_err(pblk, "data I/O submission failed: %d\n", err);
return NVM_IO_ERR;
}
@@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
/* Submit metadata write for previous data line */
err = pblk_submit_meta_io(pblk, meta_line);
if (err) {
- pr_err("pblk: metadata I/O submission failed: %d", err);
+ pblk_err(pblk, "metadata I/O submission failed: %d",
+ err);
return NVM_IO_ERR;
}
}
@@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk)
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
- pr_err("pblk: bad buffer sync calculation\n");
+ pblk_err(pblk, "bad buffer sync calculation\n");
return 1;
}
@@ -633,14 +634,14 @@ static int pblk_submit_write(struct pblk *pblk)
if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
secs_avail)) {
- pr_err("pblk: corrupted write bio\n");
+ pblk_err(pblk, "corrupted write bio\n");
goto fail_put_bio;
}
if (pblk_submit_io_set(pblk, rqd))
goto fail_free_bio;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 34cc1d64a9d4..4760af7b6499 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -119,6 +119,16 @@ struct pblk_g_ctx {
u64 lba;
};
+/* partial read context */
+struct pblk_pr_ctx {
+ struct bio *orig_bio;
+ DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
+ unsigned int orig_nr_secs;
+ unsigned int bio_init_idx;
+ void *ppa_ptr;
+ dma_addr_t dma_ppa_list;
+};
+
/* Pad context */
struct pblk_pad_rq {
struct pblk *pblk;
@@ -193,7 +203,7 @@ struct pblk_rb {
spinlock_t w_lock; /* Write lock */
spinlock_t s_lock; /* Sync lock */
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
#endif
};
@@ -608,9 +618,6 @@ struct pblk {
int min_write_pgs; /* Minimum amount of pages required by controller */
int max_write_pgs; /* Maximum amount of pages supported by controller */
- int pgs_in_buffer; /* Number of pages that need to be held in buffer to
- * guarantee successful reads.
- */
sector_t capacity; /* Device capacity when bad blocks are subtracted */
@@ -639,7 +646,7 @@ struct pblk {
u64 nr_flush_rst; /* Flushes reset value for pad dist.*/
atomic64_t nr_flush; /* Number of flush/fua I/O */
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
/* Non-persistent debug counters, 4kb sector I/Os */
atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
@@ -706,6 +713,15 @@ struct pblk_line_ws {
#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+#define pblk_err(pblk, fmt, ...) \
+ pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_info(pblk, fmt, ...) \
+ pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_warn(pblk, fmt, ...) \
+ pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_debug(pblk, fmt, ...) \
+ pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+
/*
* pblk ring buffer operations
*/
@@ -1282,20 +1298,22 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
return !(nr_secs % pblk->min_write_pgs);
}
-#ifdef CONFIG_NVM_DEBUG
-static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p,
+#ifdef CONFIG_NVM_PBLK_DEBUG
+static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p,
char *msg, int error)
{
+ struct nvm_geo *geo = &pblk->dev->geo;
+
if (p->c.is_cached) {
- pr_err("ppa: (%s: %x) cache line: %llu\n",
+ pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n",
msg, error, (u64)p->c.line);
} else if (geo->version == NVM_OCSSD_SPEC_12) {
- pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+ pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
msg, error,
p->g.ch, p->g.lun, p->g.blk,
p->g.pg, p->g.pl, p->g.sec);
} else {
- pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
+ pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
msg, error,
p->m.grp, p->m.pu, p->m.chk, p->m.sec);
}
@@ -1307,16 +1325,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
int bit = -1;
if (rqd->nr_ppas == 1) {
- print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error);
+ print_ppa(pblk, &rqd->ppa_addr, "rqd", error);
return;
}
while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
bit + 1)) < rqd->nr_ppas) {
- print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error);
+ print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error);
}
- pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+ pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
}
static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
@@ -1347,7 +1365,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
continue;
}
- print_ppa(geo, ppa, "boundary", i);
+ print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i);
return 1;
}
@@ -1377,7 +1395,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
spin_lock(&line->lock);
if (line->state != PBLK_LINESTATE_OPEN) {
- pr_err("pblk: bad ppa: line:%d,state:%d\n",
+ pblk_err(pblk, "bad ppa: line:%d,state:%d\n",
line->id, line->state);
WARN_ON(1);
spin_unlock(&line->lock);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..05f82ff6f016 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
*/
atomic_t has_dirty;
- /*
- * Set to zero by things that touch the backing volume-- except
- * writeback. Incremented by writeback. Used to determine when to
- * accelerate idle writeback.
- */
- atomic_t backing_idle;
-
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -423,9 +416,9 @@ struct cache {
/*
* When allocating new buckets, prio_write() gets first dibs - since we
* may not be allocate at all without writing priorities and gens.
- * prio_buckets[] contains the last buckets we wrote priorities to (so
- * gc can mark them as metadata), prio_next[] contains the buckets
- * allocated for the next prio write.
+ * prio_last_buckets[] contains the last buckets we wrote priorities to
+ * (so gc can mark them as metadata), prio_buckets[] contains the
+ * buckets allocated for the next prio write.
*/
uint64_t *prio_buckets;
uint64_t *prio_last_buckets;
@@ -474,6 +467,7 @@ struct cache {
struct gc_stat {
size_t nodes;
+ size_t nodes_pre;
size_t key_bytes;
size_t nkeys;
@@ -514,6 +508,8 @@ struct cache_set {
struct cache_accounting accounting;
unsigned long flags;
+ atomic_t idle_counter;
+ atomic_t at_max_writeback_rate;
struct cache_sb sb;
@@ -523,8 +519,10 @@ struct cache_set {
struct bcache_device **devices;
unsigned devices_max_used;
+ atomic_t attached_dev_nr;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
+ atomic_long_t flash_dev_dirty_sectors;
struct closure caching;
struct closure sb_write;
@@ -603,6 +601,10 @@ struct cache_set {
*/
atomic_t rescale;
/*
+ * used for GC, identify if any front side I/Os is inflight
+ */
+ atomic_t search_inflight;
+ /*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
@@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
void bch_debug_exit(void);
-int bch_debug_init(struct kobject *);
+void bch_debug_init(struct kobject *kobj);
void bch_request_exit(void);
int bch_request_init(void);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f3403b45bc28..596c93b44e9b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
+/*
+ * return array index next to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
static unsigned inorder_next(unsigned j, unsigned size)
{
if (j * 2 + 1 < size) {
@@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size)
return j;
}
+/*
+ * return array index previous to j when does in-order traverse
+ * of a binary tree which is stored in a linear array
+ */
static unsigned inorder_prev(unsigned j, unsigned size)
{
if (j * 2 < size) {
@@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
return j;
}
+/*
+ * Return the cacheline index in bset_tree->data, where j is index
+ * from a linear array which stores the auxiliar binary tree
+ */
static unsigned to_inorder(unsigned j, struct bset_tree *t)
{
return __to_inorder(j, t->size, t->extra);
@@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
return j;
}
+/*
+ * Return an index from a linear array which stores the auxiliar binary
+ * tree, j is the cacheline index of t->data.
+ */
static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
{
return __inorder_to_tree(j, t->size, t->extra);
@@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
return low;
}
+/*
+ * Calculate mantissa value for struct bkey_float.
+ * If most significant bit of f->exponent is not set, then
+ * - f->exponent >> 6 is 0
+ * - p[0] points to bkey->low
+ * - p[-1] borrows bits from KEY_INODE() of bkey->high
+ * if most isgnificant bits of f->exponent is set, then
+ * - f->exponent >> 6 is 1
+ * - p[0] points to bits from KEY_INODE() of bkey->high
+ * - p[-1] points to other bits from KEY_INODE() of
+ * bkey->high too.
+ * See make_bfloat() to check when most significant bit of f->exponent
+ * is set or not.
+ */
static inline unsigned bfloat_mantissa(const struct bkey *k,
struct bkey_float *f)
{
@@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
BUG_ON(m < l || m > r);
BUG_ON(bkey_next(p) != m);
+ /*
+ * If l and r have different KEY_INODE values (different backing
+ * device), f->exponent records how many least significant bits
+ * are different in KEY_INODE values and sets most significant
+ * bits to 1 (by +64).
+ * If l and r have same KEY_INODE value, f->exponent records
+ * how many different bits in least significant bits of bkey->low.
+ * See bfloat_mantiss() how the most significant bit of
+ * f->exponent is used to calculate bfloat mantissa value.
+ */
if (KEY_INODE(l) != KEY_INODE(r))
f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
else
@@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
}
EXPORT_SYMBOL(bch_bset_init_next);
+/*
+ * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
+ * accelerate bkey search in a btree node (pointed by bset_tree->data in
+ * memory). After search in the auxiliar tree by calling bset_search_tree(),
+ * a struct bset_search_iter is returned which indicates range [l, r] from
+ * bset_tree->data where the searching bkey might be inside. Then a followed
+ * linear comparison does the exact search, see __bch_bset_search() for how
+ * the auxiliary tree is used.
+ */
void bch_bset_build_written_tree(struct btree_keys *b)
{
struct bset_tree *t = bset_tree_last(b);
@@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
unsigned inorder, j, n = 1;
do {
+ /*
+ * A bit trick here.
+ * If p < t->size, (int)(p - t->size) is a minus value and
+ * the most significant bit is set, right shifting 31 bits
+ * gets 1. If p >= t->size, the most significant bit is
+ * not set, right shifting 31 bits gets 0.
+ * So the following 2 lines equals to
+ * if (p >= t->size)
+ * p = 0;
+ * but a branch instruction is avoided.
+ */
unsigned p = n << 4;
p &= ((int) (p - t->size)) >> 31;
@@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
f = &t->tree[j];
/*
+ * Similar bit trick, use subtract operation to avoid a branch
+ * instruction.
+ *
* n = (f->mantissa > bfloat_mantissa())
* ? j * 2
* : j * 2 + 1;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 547c9eedc2f4..c19f7716df88 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
+#define MAX_GC_TIMES 100
+#define MIN_GC_NODES 100
+#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -1008,6 +1011,13 @@ retry:
BUG_ON(b->level != level);
}
+ if (btree_node_io_error(b)) {
+ rw_unlock(write, b);
+ return ERR_PTR(-EIO);
+ }
+
+ BUG_ON(!b->written);
+
b->parent = parent;
b->accessed = 1;
@@ -1019,13 +1029,6 @@ retry:
for (; i <= b->keys.nsets; i++)
prefetch(b->keys.set[i].data);
- if (btree_node_io_error(b)) {
- rw_unlock(write, b);
- return ERR_PTR(-EIO);
- }
-
- BUG_ON(!b->written);
-
return b;
}
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
return ret;
}
+static size_t btree_gc_min_nodes(struct cache_set *c)
+{
+ size_t min_nodes;
+
+ /*
+ * Since incremental GC would stop 100ms when front
+ * side I/O comes, so when there are many btree nodes,
+ * if GC only processes constant (100) nodes each time,
+ * GC would last a long time, and the front side I/Os
+ * would run out of the buckets (since no new bucket
+ * can be allocated during GC), and be blocked again.
+ * So GC should not process constant nodes, but varied
+ * nodes according to the number of btree nodes, which
+ * realized by dividing GC into constant(100) times,
+ * so when there are many btree nodes, GC can process
+ * more nodes each time, otherwise, GC will process less
+ * nodes each time (but no less than MIN_GC_NODES)
+ */
+ min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
+ if (min_nodes < MIN_GC_NODES)
+ min_nodes = MIN_GC_NODES;
+
+ return min_nodes;
+}
+
+
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
{
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
+ if (atomic_read(&b->c->search_inflight) &&
+ gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ gc->nodes_pre = gc->nodes;
+ ret = -EAGAIN;
+ break;
+ }
+
if (need_resched()) {
ret = -EAGAIN;
break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
closure_sync(&writes);
cond_resched();
- if (ret && ret != -EAGAIN)
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(msecs_to_jiffies
+ (GC_SLEEP_MS));
+ else if (ret)
pr_warn("gc failed!");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
do {
k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad);
- if (k)
+ if (k) {
btree_node_prefetch(b, k);
+ /*
+ * initiallize c->gc_stats.nodes
+ * for incremental GC
+ */
+ b->c->gc_stats.nodes++;
+ }
if (p)
ret = btree(check_recurse, p, b, op);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d211e2c25b6b..68e9d926134d 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
BTREE_NODE_io_error,
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 0e14969182c6..618253683d40 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -199,11 +199,16 @@ static const struct file_operations debug_ops = {
.release = single_release
};
-int __init closure_debug_init(void)
+void __init closure_debug_init(void)
{
- closure_debug = debugfs_create_file("closures",
- 0400, bcache_debug, NULL, &debug_ops);
- return IS_ERR_OR_NULL(closure_debug);
+ if (!IS_ERR_OR_NULL(bcache_debug))
+ /*
+ * it is unnecessary to check return value of
+ * debugfs_create_file(), we should not care
+ * about this.
+ */
+ closure_debug = debugfs_create_file(
+ "closures", 0400, bcache_debug, NULL, &debug_ops);
}
#endif
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 71427eb5fdae..7c2c5bc7c88b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-int closure_debug_init(void);
+void closure_debug_init(void);
void closure_debug_create(struct closure *cl);
void closure_debug_destroy(struct closure *cl);
#else
-static inline int closure_debug_init(void) { return 0; }
+static inline void closure_debug_init(void) {}
static inline void closure_debug_create(struct closure *cl) {}
static inline void closure_debug_destroy(struct closure *cl) {}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d030ce3025a6..12034c07257b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
- check = bio_clone_kmalloc(bio, GFP_NOIO);
+ check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
if (!check)
return;
+ check->bi_disk = bio->bi_disk;
check->bi_opf = REQ_OP_READ;
+ check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+ check->bi_iter.bi_size = bio->bi_iter.bi_size;
+ bch_bio_map(check, NULL);
if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
@@ -248,11 +252,12 @@ void bch_debug_exit(void)
debugfs_remove_recursive(bcache_debug);
}
-int __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(struct kobject *kobj)
{
- if (!IS_ENABLED(CONFIG_DEBUG_FS))
- return 0;
-
+ /*
+ * it is unnecessary to check return value of
+ * debugfs_create_file(), we should not care
+ * about this.
+ */
bcache_debug = debugfs_create_dir("bcache", NULL);
- return IS_ERR_OR_NULL(bcache_debug);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18f1b5239620..10748c626a1d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
+ free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..7dbe8b6316a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
/*
* The journalling code doesn't handle the case where the keys to insert
* is bigger than an empty write: If we just return -ENOMEM here,
- * bio_insert() and bio_invalidate() will insert the keys created so far
+ * bch_data_insert_keys() will insert the keys created so far
* and finish the rest when the keylist is empty.
*/
if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- generic_end_io_acct(s->d->disk->queue,
- bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
+ atomic_dec(&s->d->c->search_inflight);
+
if (s->iop.bio)
bio_put(s->iop.bio);
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
closure_init(&s->cl, NULL);
do_bio_hook(s, bio, request_endio);
+ atomic_inc(&d->c->search_inflight);
s->orig_bio = bio;
s->cache_miss = NULL;
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- generic_end_io_acct(ddip->d->disk->queue,
- bio_data_dir(bio),
+ generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
&ddip->d->disk->part0, ddip->start_time);
if (bio->bi_status) {
@@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
generic_make_request(bio);
}
+static void quit_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *this_dc)
+{
+ int i;
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ /*
+ * mutex bch_register_lock may compete with other parallel requesters,
+ * or attach/detach operations on other backing device. Waiting to
+ * the mutex lock may increase I/O request latency for seconds or more.
+ * To avoid such situation, if mutext_trylock() failed, only writeback
+ * rate of current cached device is set to 1, and __update_write_back()
+ * will decide writeback rate of other cached devices (remember now
+ * c->idle_counter is 0 already).
+ */
+ if (mutex_trylock(&bch_register_lock)) {
+ for (i = 0; i < c->devices_max_used; i++) {
+ if (!c->devices[i])
+ continue;
+
+ if (UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+
+ d = c->devices[i];
+ dc = container_of(d, struct cached_dev, disk);
+ /*
+ * set writeback rate to default minimum value,
+ * then let update_writeback_rate() to decide the
+ * upcoming rate.
+ */
+ atomic_long_set(&dc->writeback_rate.rate, 1);
+ }
+ mutex_unlock(&bch_register_lock);
+ } else
+ atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
/* Cached devices - read & write stuff */
static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- atomic_set(&dc->backing_idle, 0);
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ if (likely(d->c)) {
+ if (atomic_read(&d->c->idle_counter))
+ atomic_set(&d->c->idle_counter, 0);
+ /*
+ * If at_max_writeback_rate of cache set is true and new I/O
+ * comes, quit max writeback rate of all cached devices
+ * attached to this cache set, and set at_max_writeback_rate
+ * to false.
+ */
+ if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+ atomic_set(&d->c->at_max_writeback_rate, 0);
+ quit_max_writeback_rate(d->c, dc);
+ }
+ }
+
+ generic_start_io_acct(q,
+ bio_op(bio),
+ bio_sectors(bio),
+ &d->disk->part0);
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
struct search *s;
struct closure *cl;
struct bcache_device *d = bio->bi_disk->private_data;
- int rw = bio_data_dir(bio);
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
s = search_alloc(bio, d);
cl = &s->cl;
@@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
flash_dev_nodata,
bcache_wq);
return BLK_QC_T_NONE;
- } else if (rw) {
+ } else if (bio_data_dir(bio)) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
&KEY(d->id, bio_end_sector(bio), 0));
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa4058e43202..55a37641aa95 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
goto err;
}
- sb->last_mount = get_seconds();
+ sb->last_mount = (u32)ktime_get_real_seconds();
err = NULL;
get_page(bh->b_page);
@@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
+ atomic_dec(&d->c->attached_dev_nr);
+
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
struct uuid_entry *u = d->c->uuids + d->id;
SET_UUID_FLASH_ONLY(u, 0);
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
bch_uuid_write(d->c);
}
@@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
return idx;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
- !(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_device_idx, idx);
- return -ENOMEM;
- }
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ goto err;
+
+ d->disk = alloc_disk(BCACHE_MINORS);
+ if (!d->disk)
+ goto err;
set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
blk_queue_write_cache(q, true, true);
return 0;
+
+err:
+ ida_simple_remove(&bcache_device_idx, idx);
+ return -ENOMEM;
+
}
/* Cached device */
@@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid)
{
- uint32_t rtime = cpu_to_le32(get_seconds());
+ uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
@@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
u = NULL;
}
@@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
+ atomic_inc(&c->attached_dev_nr);
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
@@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
pr_info("registered backing device %s", dc->backing_dev_name);
list_add(&dc->list, &uncached_devices);
+ /* attach to a matched cache set if it exists */
list_for_each_entry(c, &bch_cache_sets, list)
bch_cached_dev_attach(dc, c, NULL);
@@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
+ atomic_long_sub(bcache_dev_sectors_dirty(d),
+ &d->c->flash_dev_dirty_sectors);
bcache_device_free(d);
mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
@@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
get_random_bytes(u->uuid, 16);
memset(u->label, 0, 32);
- u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+ u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
SET_UUID_FLASH_ONLY(u, 1);
u->sectors = size >> 9;
@@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
+ atomic_set(&c->attached_dev_nr, 0);
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
closure_sync(&cl);
- c->sb.last_mount = get_seconds();
+ c->sb.last_mount = (u32)ktime_get_real_seconds();
bcache_write_super(c);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
@@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE))
return -EBUSY;
- if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
- !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+ path = kstrndup(buffer, size, GFP_KERNEL);
+ if (!path)
+ goto err;
+
+ sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
+ if (!sb)
goto err;
err = "failed to open device";
@@ -2324,13 +2341,21 @@ static int __init bcache_init(void)
return bcache_major;
}
- if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
- !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- bch_request_init() ||
- bch_debug_init(bcache_kobj) || closure_debug_init() ||
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ if (!bcache_wq)
+ goto err;
+
+ bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
+ if (!bcache_kobj)
+ goto err;
+
+ if (bch_request_init() ||
sysfs_create_files(bcache_kobj, files))
goto err;
+ bch_debug_init(bcache_kobj);
+ closure_debug_init();
+
return 0;
err:
bcache_exit();
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..81d3520b0702 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -149,6 +149,7 @@ SHOW(__bch_cached_dev)
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+ int wb = dc->writeback_running;
#define var(stat) (dc->stat)
@@ -170,7 +171,8 @@ SHOW(__bch_cached_dev)
var_printf(writeback_running, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
- sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
+ sysfs_hprint(writeback_rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
@@ -188,15 +190,22 @@ SHOW(__bch_cached_dev)
char change[20];
s64 next_io;
- bch_hprint(rate, dc->writeback_rate.rate << 9);
- bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
- bch_hprint(target, dc->writeback_rate_target << 9);
- bch_hprint(proportional,dc->writeback_rate_proportional << 9);
- bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
- bch_hprint(change, dc->writeback_rate_change << 9);
-
- next_io = div64_s64(dc->writeback_rate.next - local_clock(),
- NSEC_PER_MSEC);
+ /*
+ * Except for dirty and target, other values should
+ * be 0 if writeback is not running.
+ */
+ bch_hprint(rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+ : 0);
+ bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
+ bch_hprint(target, dc->writeback_rate_target << 9);
+ bch_hprint(proportional,
+ wb ? dc->writeback_rate_proportional << 9 : 0);
+ bch_hprint(integral,
+ wb ? dc->writeback_rate_integral_scaled << 9 : 0);
+ bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0);
+ next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(),
+ NSEC_PER_MSEC) : 0;
return sprintf(buf,
"rate:\t\t%s/sec\n"
@@ -255,8 +264,19 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- sysfs_strtoul_clamp(writeback_rate,
- dc->writeback_rate.rate, 1, INT_MAX);
+ if (attr == &sysfs_writeback_rate) {
+ ssize_t ret;
+ long int v = atomic_long_read(&dc->writeback_rate.rate);
+
+ ret = strtoul_safe_clamp(buf, v, 1, INT_MAX);
+
+ if (!ret) {
+ atomic_long_set(&dc->writeback_rate.rate, v);
+ ret = size;
+ }
+
+ return ret;
+ }
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
@@ -338,8 +358,8 @@ STORE(__cached_dev)
if (!v)
return size;
}
-
- pr_err("Can't attach %s: cache set not found", buf);
+ if (v == -ENOENT)
+ pr_err("Can't attach %s: cache set not found", buf);
return v;
}
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..b15256bcf0e7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+ d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
/* Bound the time. Don't let us fall further than 2 seconds behind
* (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..f7b0133c9d2f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
* Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- uint32_t rate;
+ atomic_long_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
* flash-only devices
*/
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
- bcache_flash_devs_sectors_dirty(c);
+ atomic_long_read(&c->flash_dev_dirty_sectors);
/*
* Unfortunately there is no control of global dirty data. If the
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_proportional = proportional_scaled;
dc->writeback_rate_integral_scaled = integral_scaled;
- dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
- dc->writeback_rate.rate = new_rate;
+ dc->writeback_rate_change = new_rate -
+ atomic_long_read(&dc->writeback_rate.rate);
+ atomic_long_set(&dc->writeback_rate.rate, new_rate);
dc->writeback_rate_target = target;
}
+static bool set_at_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *dc)
+{
+ /*
+ * Idle_counter is increased everytime when update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+ * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached
+ * devices number, c->attached_dev_nr is used to calculate the idle
+ * throushold. It might be bigger if not all cached device are in write-
+ * back mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
+ if (atomic_inc_return(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6)
+ return false;
+
+ if (atomic_read(&c->at_max_writeback_rate) != 1)
+ atomic_set(&c->at_max_writeback_rate, 1);
+
+ atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+ /* keep writeback_rate_target as existing value */
+ dc->writeback_rate_proportional = 0;
+ dc->writeback_rate_integral_scaled = 0;
+ dc->writeback_rate_change = 0;
+
+ /*
+ * Check c->idle_counter and c->at_max_writeback_rate agagain in case
+ * new I/O arrives during before set_at_max_writeback_rate() returns.
+ * Then the writeback rate is set to 1, and its new value should be
+ * decided via __update_writeback_rate().
+ */
+ if ((atomic_read(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6) ||
+ !atomic_read(&c->at_max_writeback_rate))
+ return false;
+
+ return true;
+}
+
static void update_writeback_rate(struct work_struct *work)
{
struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- down_read(&dc->writeback_lock);
-
- if (atomic_read(&dc->has_dirty) &&
- dc->writeback_percent)
- __update_writeback_rate(dc);
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unncessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (!set_at_max_writeback_rate(c, dc)) {
+ down_read(&dc->writeback_lock);
+ __update_writeback_rate(dc);
+ up_read(&dc->writeback_lock);
+ }
+ }
- up_read(&dc->writeback_lock);
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
delay = writeback_delay(dc, size);
- /* If the control system would wait for at least half a
- * second, and there's been no reqs hitting the backing disk
- * for awhile: use an alternate mode where we have at most
- * one contiguous set of writebacks in flight at a time. If
- * someone wants to do IO it will be quick, as it will only
- * have to contend with one operation in flight, and we'll
- * be round-tripping data to the backing disk as quickly as
- * it can accept it.
- */
- if (delay >= HZ / 2) {
- /* 3 means at least 1.5 seconds, up to 7.5 if we
- * have slowed way down.
- */
- if (atomic_inc_return(&dc->backing_idle) >= 3) {
- /* Wait for current I/Os to finish */
- closure_sync(&cl);
- /* And immediately launch a new set. */
- delay = 0;
- }
- }
-
while (!kthread_should_stop() &&
!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
delay) {
@@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
if (!d)
return;
+ if (UUID_FLASH_ONLY(&c->uuids[inode]))
+ atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
+
stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
@@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg)
}
/* Init */
+#define INIT_KEYS_EACH_TIME 500000
+#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned inode;
+ size_t count;
+ struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
KEY_START(k), KEY_SIZE(k));
+ op->count++;
+ if (atomic_read(&b->c->search_inflight) &&
+ !(op->count % INIT_KEYS_EACH_TIME)) {
+ bkey_copy_key(&op->start, k);
+ return -EAGAIN;
+ }
+
return MAP_CONTINUE;
}
void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
+ int ret;
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
-
- bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
- sectors_dirty_init_fn, 0);
+ op.count = 0;
+ op.start = KEY(op.inode, 0, 0);
+
+ do {
+ ret = bch_btree_map_keys(&op.op, d->c, &op.start,
+ sectors_dirty_init_fn, 0);
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+ else if (ret < 0) {
+ pr_warn("sectors dirty init failed, ret=%d!", ret);
+ break;
+ }
+ } while (ret == -EAGAIN);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_running = true;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
- dc->writeback_rate.rate = 1024;
+ atomic_long_set(&dc->writeback_rate.rate, 1024);
dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 610fb01de629..3745d7004c47 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
-static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
-{
- uint64_t i, ret = 0;
-
- mutex_lock(&bch_register_lock);
-
- for (i = 0; i < c->devices_max_used; i++) {
- struct bcache_device *d = c->devices[i];
-
- if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
- continue;
- ret += bcache_dev_sectors_dirty(d);
- }
-
- mutex_unlock(&bch_register_lock);
-
- return ret;
-}
-
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b0dd7027848b..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
io->start_time = jiffies;
- generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+ generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+ &dm_disk(md)->part0);
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
int pending;
int rw = bio_data_dir(bio);
- generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+ generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+ io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 994aed2f9dff..cb4eb5faa519 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -204,10 +204,6 @@ static int start_readonly;
*/
static bool create_on_open = true;
-/* bio_clone_mddev
- * like bio_clone_bioset, but with a local bio set
- */
-
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
+ const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
@@ -8046,8 +8043,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
- (int)part_stat_read(&disk->part0, sectors[1]) -
+ curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 85de8053aa34..0360c015f658 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
struct page *page, unsigned int len, unsigned int off,
- bool is_write, sector_t sector)
+ unsigned int op, sector_t sector)
{
int ret;
- if (!is_write) {
+ if (!op_is_write(op)) {
ret = btt_read_pg(btt, bip, page, off, sector, len);
flush_dcache_page(page);
} else {
@@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
}
err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(bio_op(bio)), iter.bi_sector);
+ bio_op(bio), iter.bi_sector);
if (err) {
dev_err(&btt->nd_btt->dev,
"io error in %s sector %lld, len %d,\n",
@@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
}
static int btt_rw_page(struct block_device *bdev, sector_t sector,
- struct page *page, bool is_write)
+ struct page *page, unsigned int op)
{
struct btt *btt = bdev->bd_disk->private_data;
int rc;
unsigned int len;
len = hpage_nr_pages(page) * PAGE_SIZE;
- rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
+ rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
if (rc == 0)
- page_endio(page, is_write, 0);
+ page_endio(page, op_is_write(op), 0);
return rc;
}
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 32e0364b48b9..6ee7fd7e4bbd 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
return false;
*start = jiffies;
- generic_start_io_acct(disk->queue, bio_data_dir(bio),
- bio_sectors(bio), &disk->part0);
+ generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
+ &disk->part0);
return true;
}
static inline void nd_iostat_end(struct bio *bio, unsigned long start)
{
struct gendisk *disk = bio->bi_disk;
- generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0,
- start);
+ generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
}
static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
unsigned int len)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8b1fd7f1a224..dd17acd8fe68 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
}
static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
- unsigned int len, unsigned int off, bool is_write,
+ unsigned int len, unsigned int off, unsigned int op,
sector_t sector)
{
blk_status_t rc = BLK_STS_OK;
@@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
bad_pmem = true;
- if (!is_write) {
+ if (!op_is_write(op)) {
if (unlikely(bad_pmem))
rc = BLK_STS_IOERR;
else {
@@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
- bvec.bv_offset, op_is_write(bio_op(bio)),
- iter.bi_sector);
+ bvec.bv_offset, bio_op(bio), iter.bi_sector);
if (rc) {
bio->bi_status = rc;
break;
@@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
}
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
- struct page *page, bool is_write)
+ struct page *page, unsigned int op)
{
struct pmem_device *pmem = bdev->bd_queue->queuedata;
blk_status_t rc;
rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
- 0, is_write, sector);
+ 0, op, sector);
/*
* The ->rw_page interface is subtle and tricky. The core
@@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
* caused by double completion.
*/
if (rc == 0)
- page_endio(page, is_write, 0);
+ page_endio(page, op_is_write(op), 0);
return blk_status_to_errno(rc);
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bf65501e6ed6..dd8ec1dd9219 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req)
trace_nvme_complete_rq(req);
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if (nvme_req_needs_failover(req, status)) {
+ if ((req->cmd_flags & REQ_NVME_MPATH) &&
+ blk_path_error(status)) {
nvme_failover_req(req);
return;
}
@@ -617,6 +618,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
+ } else if (req_op(req) == REQ_OP_WRITE) {
+ t10_pi_prepare(req, ns->pi_type);
}
switch (ns->pi_type) {
@@ -627,8 +630,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
- cmnd->rw.reftag = cpu_to_le32(
- nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
}
@@ -638,6 +640,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
return 0;
}
+void nvme_cleanup_cmd(struct request *req)
+{
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ nvme_req(req)->status == 0) {
+ struct nvme_ns *ns = req->rq_disk->private_data;
+
+ t10_pi_complete(req, ns->pi_type,
+ blk_rq_bytes(req) >> ns->lba_shift);
+ }
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ kfree(page_address(req->special_vec.bv_page) +
+ req->special_vec.bv_offset);
+ }
+}
+EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
+
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd)
{
@@ -668,10 +686,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
}
cmd->common.command_id = req->tag;
- if (ns)
- trace_nvme_setup_nvm_cmd(req->q->id, cmd);
- else
- trace_nvme_setup_admin_cmd(cmd);
+ trace_nvme_setup_cmd(req, cmd);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -864,9 +879,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
if (unlikely(ctrl->kato == 0))
return;
- INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
- memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
- ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
@@ -1056,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -1472,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk,
set_capacity(disk, capacity);
nvme_config_discard(ns);
+
+ if (id->nsattr & (1 << 0))
+ set_disk_ro(disk, true);
+ else
+ set_disk_ro(disk, false);
+
blk_mq_unfreeze_queue(disk->queue);
}
@@ -2270,21 +2288,16 @@ out_unlock:
return ret;
}
-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 log_page, void *log,
- size_t size, u64 offset)
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+ void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
unsigned long dwlen = size / 4 - 1;
c.get_log_page.opcode = nvme_admin_get_log_page;
-
- if (ns)
- c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
- else
- c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
-
+ c.get_log_page.nsid = cpu_to_le32(nsid);
c.get_log_page.lid = log_page;
+ c.get_log_page.lsp = lsp;
c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
@@ -2293,12 +2306,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
- size_t size)
-{
- return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
-}
-
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
{
int ret;
@@ -2309,8 +2316,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
if (!ctrl->effects)
return 0;
- ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
- sizeof(*ctrl->effects));
+ ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
+ ctrl->effects, sizeof(*ctrl->effects), 0);
if (ret) {
kfree(ctrl->effects);
ctrl->effects = NULL;
@@ -2401,6 +2408,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
+ ctrl->max_namespaces = le32_to_cpu(id->mnan);
if (id->rtd3e) {
/* us -> s */
@@ -2460,8 +2468,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
+ ret = nvme_mpath_init(ctrl, id);
kfree(id);
+ if (ret < 0)
+ return ret;
+
if (ctrl->apst_enabled && !prev_apst_enabled)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2680,6 +2692,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_ana_grpid.attr,
+ &dev_attr_ana_state.attr,
+#endif
NULL,
};
@@ -2702,6 +2718,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
return 0;
}
+#ifdef CONFIG_NVME_MULTIPATH
+ if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
+ if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
+ return 0;
+ if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
+ return 0;
+ }
+#endif
return a->mode;
}
@@ -3075,8 +3099,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_get_ctrl(ctrl);
- kfree(id);
-
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_id_attr_group))
@@ -3086,8 +3108,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
ns->disk->disk_name);
- nvme_mpath_add_disk(ns->head);
+ nvme_mpath_add_disk(ns, id);
nvme_fault_inject_init(ns);
+ kfree(id);
+
return;
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
@@ -3229,7 +3253,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
- error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
+ log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -3346,9 +3371,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
- dev_warn(ctrl->device,
- "Get FW SLOT INFO log error\n");
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log,
+ sizeof(*log), 0))
+ dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -3394,6 +3419,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
+#ifdef CONFIG_NVME_MULTIPATH
+ case NVME_AER_NOTICE_ANA:
+ if (!ctrl->ana_log_buf)
+ break;
+ queue_work(nvme_wq, &ctrl->ana_work);
+ break;
+#endif
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3426,6 +3458,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
+ nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
flush_work(&ctrl->scan_work);
@@ -3463,6 +3496,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
+ nvme_mpath_uninit(ctrl);
if (subsys) {
mutex_lock(&subsys->lock);
@@ -3499,6 +3533,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+ memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
+ ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+
ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
if (ret < 0)
goto out;
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index f7efe5a58cc7..206d63cb1afc 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
{
- if (ctrl->opts->max_reconnects != -1 &&
+ if (ctrl->opts->max_reconnects == -1 ||
ctrl->nr_reconnects < ctrl->opts->max_reconnects)
return true;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9bac912173ba..611e70cae754 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
}
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 41279da799ed..6fe5923c95d4 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id,
/* Set compacted version for upper layers */
geo->version = NVM_OCSSD_SPEC_20;
- if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) {
- pr_err("nvm: OCSSD version not supported (v%d.%d)\n",
- geo->major_ver_id, geo->minor_ver_id);
- return -EINVAL;
- }
-
geo->num_ch = le16_to_cpu(id->num_grp);
geo->num_lun = le16_to_cpu(id->num_pu);
geo->all_luns = geo->num_ch * geo->num_lun;
@@ -583,7 +577,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
struct ppa_addr ppa;
size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
size_t log_pos, offset, len;
- int ret, i;
+ int ret, i, max_len;
+
+ /*
+ * limit requests to maximum 256K to avoid issuing arbitrary large
+ * requests when the device does not specific a maximum transfer size.
+ */
+ max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);
/* Normalize lba address space to obtain log offset */
ppa.ppa = slba;
@@ -596,10 +596,11 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
offset = log_pos * sizeof(struct nvme_nvm_chk_meta);
while (left) {
- len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9);
+ len = min_t(unsigned int, left, max_len);
- ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
- dev_meta, len, offset);
+ ret = nvme_get_log(ctrl, ns->head->ns_id,
+ NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
+ offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;
@@ -662,12 +663,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
- if (rqd->bio) {
+ if (rqd->bio)
blk_init_request_from_bio(rq, rqd->bio);
- } else {
+ else
rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
- rq->__data_len = 0;
- }
return rq;
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1ffd3e8b13a1..5a9562881d4e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Christoph Hellwig.
+ * Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+}
+
/*
* If multipathing is enabled we need to always use the subsystem instance
* number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
+ u16 status = nvme_req(req)->status;
unsigned long flags;
spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req)
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
- nvme_reset_ctrl(ns->ctrl);
- kblockd_schedule_work(&ns->head->requeue_work);
-}
+ switch (status & 0x7ff) {
+ case NVME_SC_ANA_TRANSITION:
+ case NVME_SC_ANA_INACCESSIBLE:
+ case NVME_SC_ANA_PERSISTENT_LOSS:
+ /*
+ * If we got back an ANA error we know the controller is alive,
+ * but not ready to serve this namespaces. The spec suggests
+ * we should update our general state here, but due to the fact
+ * that the admin and I/O queues are not serialized that is
+ * fundamentally racy. So instead just clear the current path,
+ * mark the the path as pending and kick of a re-read of the ANA
+ * log page ASAP.
+ */
+ nvme_mpath_clear_current_path(ns);
+ if (ns->ctrl->ana_log_buf) {
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
+ }
+ break;
+ default:
+ /*
+ * Reset the controller for any non-ANA error as we don't know
+ * what caused the error.
+ */
+ nvme_reset_ctrl(ns->ctrl);
+ break;
+ }
-bool nvme_req_needs_failover(struct request *req, blk_status_t error)
-{
- if (!(req->cmd_flags & REQ_NVME_MPATH))
- return false;
- return blk_path_error(error);
+ kblockd_schedule_work(&ns->head->requeue_work);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
up_read(&ctrl->namespaces_rwsem);
}
+static const char *nvme_ana_state_names[] = {
+ [0] = "invalid state",
+ [NVME_ANA_OPTIMIZED] = "optimized",
+ [NVME_ANA_NONOPTIMIZED] = "non-optimized",
+ [NVME_ANA_INACCESSIBLE] = "inaccessible",
+ [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
+ [NVME_ANA_CHANGE] = "change",
+};
+
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
- struct nvme_ns *ns;
+ struct nvme_ns *ns, *fallback = NULL;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ if (ns->ctrl->state != NVME_CTRL_LIVE ||
+ test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ continue;
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
rcu_assign_pointer(head->current_path, ns);
return ns;
+ case NVME_ANA_NONOPTIMIZED:
+ fallback = ns;
+ break;
+ default:
+ break;
}
}
- return NULL;
+ if (fallback)
+ rcu_assign_pointer(head->current_path, fallback);
+ return fallback;
+}
+
+static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
+{
+ return ns->ctrl->state == NVME_CTRL_LIVE &&
+ ns->ana_state == NVME_ANA_OPTIMIZED;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
- if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+ if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head);
return ns;
}
@@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path, &head->srcu);
- if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+ if (likely(ns && nvme_path_is_optimized(ns)))
found = ns->queue->poll_fn(q, qc);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
@@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
struct request_queue *q;
bool vwc = false;
+ mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -220,29 +273,232 @@ out:
return -ENOMEM;
}
-void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static void nvme_mpath_set_live(struct nvme_ns *ns)
{
+ struct nvme_ns_head *head = ns->head;
+
+ lockdep_assert_held(&ns->head->lock);
+
if (!head->disk)
return;
- mutex_lock(&head->subsys->lock);
if (!(head->disk->flags & GENHD_FL_UP)) {
device_add_disk(&head->subsys->dev, head->disk);
if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group))
- pr_warn("%s: failed to create sysfs group for identification\n",
- head->disk->disk_name);
+ dev_warn(&head->subsys->dev,
+ "failed to create id group.\n");
+ }
+
+ kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
+ int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
+ void *))
+{
+ void *base = ctrl->ana_log_buf;
+ size_t offset = sizeof(struct nvme_ana_rsp_hdr);
+ int error, i;
+
+ lockdep_assert_held(&ctrl->ana_lock);
+
+ for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
+ struct nvme_ana_group_desc *desc = base + offset;
+ u32 nr_nsids = le32_to_cpu(desc->nnsids);
+ size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+
+ if (WARN_ON_ONCE(desc->grpid == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
+ return -EINVAL;
+
+ offset += sizeof(*desc);
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
+ return -EINVAL;
+
+ error = cb(ctrl, desc, data);
+ if (error)
+ return error;
+
+ offset += nsid_buf_size;
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline bool nvme_state_is_live(enum nvme_ana_state state)
+{
+ return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
+}
+
+static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
+ struct nvme_ns *ns)
+{
+ enum nvme_ana_state old;
+
+ mutex_lock(&ns->head->lock);
+ old = ns->ana_state;
+ ns->ana_grpid = le32_to_cpu(desc->grpid);
+ ns->ana_state = desc->state;
+ clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+
+ if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
+}
+
+static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
+ unsigned *nr_change_groups = data;
+ struct nvme_ns *ns;
+
+ dev_info(ctrl->device, "ANA group %d: %s.\n",
+ le32_to_cpu(desc->grpid),
+ nvme_ana_state_names[desc->state]);
+
+ if (desc->state == NVME_ANA_CHANGE)
+ (*nr_change_groups)++;
+
+ if (!nr_nsids)
+ return 0;
+
+ down_write(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ continue;
+ nvme_update_ns_ana_state(desc, ns);
+ if (++n == nr_nsids)
+ break;
+ }
+ up_write(&ctrl->namespaces_rwsem);
+ WARN_ON_ONCE(n < nr_nsids);
+ return 0;
+}
+
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+{
+ u32 nr_change_groups = 0;
+ int error;
+
+ mutex_lock(&ctrl->ana_lock);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
+ groups_only ? NVME_ANA_LOG_RGO : 0,
+ ctrl->ana_log_buf, ctrl->ana_log_size, 0);
+ if (error) {
+ dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
+ goto out_unlock;
+ }
+
+ error = nvme_parse_ana_log(ctrl, &nr_change_groups,
+ nvme_update_ana_state);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * In theory we should have an ANATT timer per group as they might enter
+ * the change state at different times. But that is a lot of overhead
+ * just to protect against a target that keeps entering new changes
+ * states while never finishing previous ones. But we'll still
+ * eventually time out once all groups are in change state, so this
+ * isn't a big deal.
+ *
+ * We also double the ANATT value to provide some slack for transports
+ * or AEN processing overhead.
+ */
+ if (nr_change_groups)
+ mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
+ else
+ del_timer_sync(&ctrl->anatt_timer);
+out_unlock:
+ mutex_unlock(&ctrl->ana_lock);
+ return error;
+}
+
+static void nvme_ana_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+
+ nvme_read_ana_log(ctrl, false);
+}
+
+static void nvme_anatt_timeout(struct timer_list *t)
+{
+ struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
+
+ dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
+ nvme_reset_ctrl(ctrl);
+}
+
+void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_ctrl_use_ana(ctrl))
+ return;
+ del_timer_sync(&ctrl->anatt_timer);
+ cancel_work_sync(&ctrl->ana_work);
+}
+
+static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+}
+DEVICE_ATTR_RO(ana_grpid);
+
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+}
+DEVICE_ATTR_RO(ana_state);
+
+static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ struct nvme_ns *ns = data;
+
+ if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
+ nvme_update_ns_ana_state(desc, ns);
+ return -ENXIO; /* just break out of the loop */
+ }
+
+ return 0;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ if (nvme_ctrl_use_ana(ns->ctrl)) {
+ mutex_lock(&ns->ctrl->ana_lock);
+ ns->ana_grpid = le32_to_cpu(id->anagrpid);
+ nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+ mutex_unlock(&ns->ctrl->ana_lock);
+ } else {
+ mutex_lock(&ns->head->lock);
+ ns->ana_state = NVME_ANA_OPTIMIZED;
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
}
- mutex_unlock(&head->subsys->lock);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
- sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
- &nvme_ns_id_attr_group);
- del_gendisk(head->disk);
+ if (head->disk->flags & GENHD_FL_UP) {
+ sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+ &nvme_ns_id_attr_group);
+ del_gendisk(head->disk);
+ }
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
@@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
+
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ int error;
+
+ if (!nvme_ctrl_use_ana(ctrl))
+ return 0;
+
+ ctrl->anacap = id->anacap;
+ ctrl->anatt = id->anatt;
+ ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
+ ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
+
+ mutex_init(&ctrl->ana_lock);
+ timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+ ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+ ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
+ if (!(ctrl->anacap & (1 << 6)))
+ ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+
+ if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+ dev_err(ctrl->device,
+ "ANA log page size (%zd) larger than MDTS (%d).\n",
+ ctrl->ana_log_size,
+ ctrl->max_hw_sectors << SECTOR_SHIFT);
+ dev_err(ctrl->device, "disabling ANA support.\n");
+ return 0;
+ }
+
+ INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+ ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
+ if (!ctrl->ana_log_buf)
+ goto out;
+
+ error = nvme_read_ana_log(ctrl, true);
+ if (error)
+ goto out_free_ana_log_buf;
+ return 0;
+out_free_ana_log_buf:
+ kfree(ctrl->ana_log_buf);
+out:
+ return -ENOMEM;
+}
+
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+ kfree(ctrl->ana_log_buf);
+}
+
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0c4a33df3b2f..bb4a2003c097 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -102,6 +102,7 @@ struct nvme_request {
u8 retries;
u8 flags;
u16 status;
+ struct nvme_ctrl *ctrl;
};
/*
@@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req)
return blk_mq_rq_to_pdu(req);
}
+static inline u16 nvme_req_qid(struct request *req)
+{
+ if (!req->rq_disk)
+ return 0;
+ return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
+}
+
/* The below value is the specific amount of delay needed before checking
* readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
* NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -175,6 +183,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
+ u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
u32 vs;
@@ -197,6 +206,19 @@ struct nvme_ctrl {
struct work_struct fw_act_work;
unsigned long events;
+#ifdef CONFIG_NVME_MULTIPATH
+ /* asymmetric namespace access: */
+ u8 anacap;
+ u8 anatt;
+ u32 anagrpmax;
+ u32 nanagrpid;
+ struct mutex ana_lock;
+ struct nvme_ana_rsp_hdr *ana_log_buf;
+ size_t ana_log_size;
+ struct timer_list anatt_timer;
+ struct work_struct ana_work;
+#endif
+
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@@ -261,6 +283,7 @@ struct nvme_ns_head {
struct bio_list requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
+ struct mutex lock;
#endif
struct list_head list;
struct srcu_struct srcu;
@@ -287,6 +310,10 @@ struct nvme_ns {
struct nvme_ctrl *ctrl;
struct request_queue *queue;
struct gendisk *disk;
+#ifdef CONFIG_NVME_MULTIPATH
+ enum nvme_ana_state ana_state;
+ u32 ana_grpid;
+#endif
struct list_head siblings;
struct nvm_dev *ndev;
struct kref kref;
@@ -299,8 +326,9 @@ struct nvme_ns {
bool ext;
u8 pi_type;
unsigned long flags;
-#define NVME_NS_REMOVING 0
-#define NVME_NS_DEAD 1
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD 1
+#define NVME_NS_ANA_PENDING 2
u16 noiob;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -356,14 +384,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
return (sector >> (ns->lba_shift - 9));
}
-static inline void nvme_cleanup_cmd(struct request *req)
-{
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- kfree(page_address(req->special_vec.bv_page) +
- req->special_vec.bv_offset);
- }
-}
-
static inline void nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
{
@@ -420,6 +440,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
+void nvme_cleanup_cmd(struct request *req);
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
@@ -435,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 log_page, void *log, size_t size, u64 offset);
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+ void *log, size_t size, u64 offset);
extern const struct attribute_group nvme_ns_id_attr_group;
extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
+bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
void nvme_failover_req(struct request *req);
-bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
+void nvme_mpath_stop(struct nvme_ctrl *ctrl);
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -468,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+extern struct device_attribute dev_attr_ana_grpid;
+extern struct device_attribute dev_attr_ana_state;
+
#else
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return false;
+}
/*
* Without the multipath code enabled, multiple controller per subsystems are
* visible as devices and thus we cannot use the subsystem instance.
@@ -482,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
static inline void nvme_failover_req(struct request *req)
{
}
-static inline bool nvme_req_needs_failover(struct request *req,
- blk_status_t error)
-{
- return false;
-}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
}
@@ -495,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
return 0;
}
-static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
+ struct nvme_id_ns *id)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -507,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
+ struct nvme_id_ctrl *id)
+{
+ return 0;
+}
+static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddd441b1516a..1b9951d2067e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
+
+ nvme_req(req)->ctrl = &dev->ctrl;
return 0;
}
@@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
mempool_free(iod->sg, dev->iod_mempool);
}
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
- if (be32_to_cpu(pi->ref_tag) == v)
- pi->ref_tag = cpu_to_be32(p);
-}
-
-static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
- if (be32_to_cpu(pi->ref_tag) == p)
- pi->ref_tag = cpu_to_be32(v);
-}
-
-/**
- * nvme_dif_remap - remaps ref tags to bip seed and physical lba
- *
- * The virtual start sector is the one that was originally submitted by the
- * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
- * start sector may be different. Remap protection information to match the
- * physical LBA on writes, and back to the original seed on reads.
- *
- * Type 0 and 3 do not have a ref tag, so no remapping required.
- */
-static void nvme_dif_remap(struct request *req,
- void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
-{
- struct nvme_ns *ns = req->rq_disk->private_data;
- struct bio_integrity_payload *bip;
- struct t10_pi_tuple *pi;
- void *p, *pmap;
- u32 i, nlb, ts, phys, virt;
-
- if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
- return;
-
- bip = bio_integrity(req->bio);
- if (!bip)
- return;
-
- pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
-
- p = pmap;
- virt = bip_get_seed(bip);
- phys = nvme_block_nr(ns, blk_rq_pos(req));
- nlb = (blk_rq_bytes(req) >> ns->lba_shift);
- ts = ns->disk->queue->integrity.tuple_size;
-
- for (i = 0; i < nlb; i++, virt++, phys++) {
- pi = (struct t10_pi_tuple *)p;
- dif_swap(phys, virt, pi);
- p += ts;
- }
- kunmap_atomic(pmap);
-}
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_dif_remap(struct request *req,
- void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
-{
-}
-static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-}
-static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-}
-#endif
-
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
int i;
@@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
goto out_unmap;
- if (req_op(req) == REQ_OP_WRITE)
- nvme_dif_remap(req, nvme_dif_prep);
-
if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
goto out_unmap;
}
@@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
if (iod->nents) {
dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
- if (blk_integrity_rq(req)) {
- if (req_op(req) == REQ_OP_READ)
- nvme_dif_remap(req, nvme_dif_complete);
+ if (blk_integrity_rq(req))
dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
- }
}
nvme_cleanup_cmd(req);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 66ec5985c9f3..0805fa6215ee 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -40,13 +40,14 @@
#define NVME_RDMA_MAX_SEGMENTS 256
-#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
+#define NVME_RDMA_MAX_INLINE_SEGMENTS 4
struct nvme_rdma_device {
struct ib_device *dev;
struct ib_pd *pd;
struct kref ref;
struct list_head entry;
+ unsigned int num_inline_segments;
};
struct nvme_rdma_qe {
@@ -117,6 +118,7 @@ struct nvme_rdma_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ bool use_inline_data;
};
static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
/* +1 for drain */
init_attr.cap.max_recv_wr = queue->queue_size + 1;
init_attr.cap.max_recv_sge = 1;
- init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+ init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = queue->ib_cq;
@@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
struct ib_device *ibdev = dev->dev;
int ret;
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
DMA_TO_DEVICE);
if (ret)
@@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
goto out_free_pd;
}
+ ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+ ndev->dev->attrs.max_sge - 1);
list_add(&ndev->entry, &device_list);
out_unlock:
mutex_unlock(&device_list_mutex);
@@ -868,6 +873,31 @@ out_free_io_queues:
return ret;
}
+static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+ bool remove)
+{
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ nvme_rdma_stop_queue(&ctrl->queues[0]);
+ blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+ &ctrl->ctrl);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_rdma_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+ bool remove)
+{
+ if (ctrl->ctrl.queue_count > 1) {
+ nvme_stop_queues(&ctrl->ctrl);
+ nvme_rdma_stop_io_queues(ctrl);
+ blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+ &ctrl->ctrl);
+ if (remove)
+ nvme_start_queues(&ctrl->ctrl);
+ nvme_rdma_destroy_io_queues(ctrl, remove);
+ }
+}
+
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
}
}
-static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
- struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
- struct nvme_rdma_ctrl, reconnect_work);
+ int ret = -EINVAL;
bool changed;
- int ret;
- ++ctrl->ctrl.nr_reconnects;
-
- ret = nvme_rdma_configure_admin_queue(ctrl, false);
+ ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
- goto requeue;
+ return ret;
+
+ if (ctrl->ctrl.icdoff) {
+ dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+ goto destroy_admin;
+ }
+
+ if (!(ctrl->ctrl.sgls & (1 << 2))) {
+ dev_err(ctrl->ctrl.device,
+ "Mandatory keyed sgls are not supported!\n");
+ goto destroy_admin;
+ }
+
+ if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+ dev_warn(ctrl->ctrl.device,
+ "queue_size %zu > ctrl sqsize %u, clamping down\n",
+ ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+ }
+
+ if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+ dev_warn(ctrl->ctrl.device,
+ "sqsize %u > ctrl maxcmd %u, clamping down\n",
+ ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+ ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+ }
+
+ if (ctrl->ctrl.sgls & (1 << 20))
+ ctrl->use_inline_data = true;
if (ctrl->ctrl.queue_count > 1) {
- ret = nvme_rdma_configure_io_queues(ctrl, false);
+ ret = nvme_rdma_configure_io_queues(ctrl, new);
if (ret)
goto destroy_admin;
}
@@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
if (!changed) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
- return;
+ ret = -EINVAL;
+ goto destroy_io;
}
nvme_start_ctrl(&ctrl->ctrl);
+ return 0;
+
+destroy_io:
+ if (ctrl->ctrl.queue_count > 1)
+ nvme_rdma_destroy_io_queues(ctrl, new);
+destroy_admin:
+ nvme_rdma_stop_queue(&ctrl->queues[0]);
+ nvme_rdma_destroy_admin_queue(ctrl, new);
+ return ret;
+}
+
+static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+{
+ struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+ struct nvme_rdma_ctrl, reconnect_work);
+
+ ++ctrl->ctrl.nr_reconnects;
+
+ if (nvme_rdma_setup_ctrl(ctrl, false))
+ goto requeue;
dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
ctrl->ctrl.nr_reconnects);
@@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
return;
-destroy_admin:
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- nvme_rdma_destroy_admin_queue(ctrl, false);
requeue:
dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
ctrl->ctrl.nr_reconnects);
@@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl, err_work);
nvme_stop_keep_alive(&ctrl->ctrl);
-
- if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_io_queues(ctrl, false);
- }
-
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_admin_queue(ctrl, false);
-
- /*
- * queues are not a live anymore, so restart the queues to fail fast
- * new IO
- */
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
+ nvme_rdma_teardown_admin_queue(ctrl, false);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c)
}
static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
- struct nvme_rdma_request *req, struct nvme_command *c)
+ struct nvme_rdma_request *req, struct nvme_command *c,
+ int count)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+ struct scatterlist *sgl = req->sg_table.sgl;
+ struct ib_sge *sge = &req->sge[1];
+ u32 len = 0;
+ int i;
- req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
- req->sge[1].length = sg_dma_len(req->sg_table.sgl);
- req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+ for (i = 0; i < count; i++, sgl++, sge++) {
+ sge->addr = sg_dma_address(sgl);
+ sge->length = sg_dma_len(sgl);
+ sge->lkey = queue->device->pd->local_dma_lkey;
+ len += sge->length;
+ }
sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
- sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+ sg->length = cpu_to_le32(len);
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
- req->num_sge++;
+ req->num_sge += count;
return 0;
}
@@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
goto out_free_table;
}
- if (count == 1) {
+ if (count <= dev->num_inline_segments) {
if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+ queue->ctrl->use_inline_data &&
blk_rq_payload_bytes(rq) <=
nvme_rdma_inline_data_size(queue)) {
- ret = nvme_rdma_map_sg_inline(queue, req, c);
+ ret = nvme_rdma_map_sg_inline(queue, req, c, count);
goto out;
}
- if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+ if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
ret = nvme_rdma_map_sg_single(queue, req, c);
goto out;
}
@@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
nvme_rdma_destroy_queue_ib(queue);
+ /* fall through */
case RDMA_CM_EVENT_ADDR_ERROR:
dev_dbg(queue->ctrl->ctrl.device,
"CM error event %d\n", ev->event);
@@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
- if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- nvme_rdma_destroy_io_queues(ctrl, shutdown);
- }
-
+ nvme_rdma_teardown_io_queues(ctrl, shutdown);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
- nvme_cancel_request, &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+ nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl =
container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
- int ret;
- bool changed;
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
return;
}
- ret = nvme_rdma_configure_admin_queue(ctrl, false);
- if (ret)
+ if (nvme_rdma_setup_ctrl(ctrl, false))
goto out_fail;
- if (ctrl->ctrl.queue_count > 1) {
- ret = nvme_rdma_configure_io_queues(ctrl, false);
- if (ret)
- goto out_fail;
- }
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- if (!changed) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
- return;
- }
-
- nvme_start_ctrl(&ctrl->ctrl);
-
return;
out_fail:
@@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
WARN_ON_ONCE(!changed);
- ret = nvme_rdma_configure_admin_queue(ctrl, true);
+ ret = nvme_rdma_setup_ctrl(ctrl, true);
if (ret)
goto out_uninit_ctrl;
- /* sanity check icdoff */
- if (ctrl->ctrl.icdoff) {
- dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
- ret = -EINVAL;
- goto out_remove_admin_queue;
- }
-
- /* sanity check keyed sgls */
- if (!(ctrl->ctrl.sgls & (1 << 2))) {
- dev_err(ctrl->ctrl.device,
- "Mandatory keyed sgls are not supported!\n");
- ret = -EINVAL;
- goto out_remove_admin_queue;
- }
-
- /* only warn if argument is too large here, will clamp later */
- if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
- dev_warn(ctrl->ctrl.device,
- "queue_size %zu > ctrl sqsize %u, clamping down\n",
- opts->queue_size, ctrl->ctrl.sqsize + 1);
- }
-
- /* warn if maxcmd is lower than sqsize+1 */
- if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
- dev_warn(ctrl->ctrl.device,
- "sqsize %u > ctrl maxcmd %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
- ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
- }
-
- if (opts->nr_io_queues) {
- ret = nvme_rdma_configure_io_queues(ctrl, true);
- if (ret)
- goto out_remove_admin_queue;
- }
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
-
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
@@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
- nvme_start_ctrl(&ctrl->ctrl);
-
return &ctrl->ctrl;
-out_remove_admin_queue:
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- nvme_rdma_destroy_admin_queue(ctrl, true);
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 41944bbef835..25b0e310f4a8 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
return nvme_trace_common(p, cdw10);
}
}
+
+const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ if (*name)
+ trace_seq_printf(p, "disk=%s, ", name);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 01390f0e1671..a490790d6691 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -50,13 +50,8 @@
nvme_admin_opcode_name(nvme_admin_security_recv), \
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
-const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
- u8 *cdw10);
-#define __parse_nvme_admin_cmd(opcode, cdw10) \
- nvme_trace_parse_admin_cmd(p, opcode, cdw10)
-
#define nvme_opcode_name(opcode) { opcode, #opcode }
-#define show_opcode_name(val) \
+#define show_nvm_opcode_name(val) \
__print_symbolic(val, \
nvme_opcode_name(nvme_cmd_flush), \
nvme_opcode_name(nvme_cmd_write), \
@@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
nvme_opcode_name(nvme_cmd_resv_acquire), \
nvme_opcode_name(nvme_cmd_resv_release))
-const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
- u8 *cdw10);
-#define __parse_nvme_cmd(opcode, cdw10) \
- nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
-
-TRACE_EVENT(nvme_setup_admin_cmd,
- TP_PROTO(struct nvme_command *cmd),
- TP_ARGS(cmd),
- TP_STRUCT__entry(
- __field(u8, opcode)
- __field(u8, flags)
- __field(u16, cid)
- __field(u64, metadata)
- __array(u8, cdw10, 24)
- ),
- TP_fast_assign(
- __entry->opcode = cmd->common.opcode;
- __entry->flags = cmd->common.flags;
- __entry->cid = cmd->common.command_id;
- __entry->metadata = le64_to_cpu(cmd->common.metadata);
- memcpy(__entry->cdw10, cmd->common.cdw10,
- sizeof(__entry->cdw10));
- ),
- TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
- __entry->cid, __entry->flags, __entry->metadata,
- show_admin_opcode_name(__entry->opcode),
- __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
-);
-
+#define show_opcode_name(qid, opcode) \
+ (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
-TRACE_EVENT(nvme_setup_nvm_cmd,
- TP_PROTO(int qid, struct nvme_command *cmd),
- TP_ARGS(qid, cmd),
+const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
+ u8 *cdw10);
+const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
+ u8 *cdw10);
+
+#define parse_nvme_cmd(qid, opcode, cdw10) \
+ (qid ? \
+ nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \
+ nvme_trace_parse_admin_cmd(p, opcode, cdw10))
+
+const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
+#define __print_disk_name(name) \
+ nvme_trace_disk_name(p, name)
+
+#ifndef TRACE_HEADER_MULTI_READ
+static inline void __assign_disk_name(char *name, struct gendisk *disk)
+{
+ if (disk)
+ memcpy(name, disk->disk_name, DISK_NAME_LEN);
+ else
+ memset(name, 0, DISK_NAME_LEN);
+}
+#endif
+
+TRACE_EVENT(nvme_setup_cmd,
+ TP_PROTO(struct request *req, struct nvme_command *cmd),
+ TP_ARGS(req, cmd),
TP_STRUCT__entry(
- __field(int, qid)
- __field(u8, opcode)
- __field(u8, flags)
- __field(u16, cid)
- __field(u32, nsid)
- __field(u64, metadata)
- __array(u8, cdw10, 24)
+ __array(char, disk, DISK_NAME_LEN)
+ __field(int, ctrl_id)
+ __field(int, qid)
+ __field(u8, opcode)
+ __field(u8, flags)
+ __field(u16, cid)
+ __field(u32, nsid)
+ __field(u64, metadata)
+ __array(u8, cdw10, 24)
),
TP_fast_assign(
- __entry->qid = qid;
- __entry->opcode = cmd->common.opcode;
- __entry->flags = cmd->common.flags;
- __entry->cid = cmd->common.command_id;
- __entry->nsid = le32_to_cpu(cmd->common.nsid);
- __entry->metadata = le64_to_cpu(cmd->common.metadata);
- memcpy(__entry->cdw10, cmd->common.cdw10,
- sizeof(__entry->cdw10));
+ __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+ __entry->qid = nvme_req_qid(req);
+ __entry->opcode = cmd->common.opcode;
+ __entry->flags = cmd->common.flags;
+ __entry->cid = cmd->common.command_id;
+ __entry->nsid = le32_to_cpu(cmd->common.nsid);
+ __entry->metadata = le64_to_cpu(cmd->common.metadata);
+ __assign_disk_name(__entry->disk, req->rq_disk);
+ memcpy(__entry->cdw10, cmd->common.cdw10,
+ sizeof(__entry->cdw10));
),
- TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
- __entry->qid, __entry->nsid, __entry->cid,
+ TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+ __entry->ctrl_id, __print_disk_name(__entry->disk),
+ __entry->qid, __entry->cid, __entry->nsid,
__entry->flags, __entry->metadata,
- show_opcode_name(__entry->opcode),
- __parse_nvme_cmd(__entry->opcode, __entry->cdw10))
+ show_opcode_name(__entry->qid, __entry->opcode),
+ parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
);
TRACE_EVENT(nvme_complete_rq,
TP_PROTO(struct request *req),
TP_ARGS(req),
TP_STRUCT__entry(
- __field(int, qid)
- __field(int, cid)
- __field(u64, result)
- __field(u8, retries)
- __field(u8, flags)
- __field(u16, status)
+ __array(char, disk, DISK_NAME_LEN)
+ __field(int, ctrl_id)
+ __field(int, qid)
+ __field(int, cid)
+ __field(u64, result)
+ __field(u8, retries)
+ __field(u8, flags)
+ __field(u16, status)
),
TP_fast_assign(
- __entry->qid = req->q->id;
- __entry->cid = req->tag;
- __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
- __entry->retries = nvme_req(req)->retries;
- __entry->flags = nvme_req(req)->flags;
- __entry->status = nvme_req(req)->status;
+ __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+ __entry->qid = nvme_req_qid(req);
+ __entry->cid = req->tag;
+ __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
+ __entry->retries = nvme_req(req)->retries;
+ __entry->flags = nvme_req(req)->flags;
+ __entry->status = nvme_req(req)->status;
+ __assign_disk_name(__entry->disk, req->rq_disk);
),
- TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+ TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+ __entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->result,
__entry->retries, __entry->flags, __entry->status)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 38803576d5e1..a21caea1e080 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,6 +19,19 @@
#include <asm/unaligned.h>
#include "nvmet.h"
+/*
+ * This helper allows us to clear the AEN based on the RAE bit,
+ * Please use this helper when processing the log pages which are
+ * associated with the AEN.
+ */
+static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
+{
+ int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
+
+ if (!rae)
+ clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
+}
+
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
{
u32 len = le16_to_cpu(cmd->get_log_page.numdu);
@@ -128,6 +141,36 @@ out:
nvmet_req_complete(req, status);
}
+static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
+{
+ u16 status = NVME_SC_INTERNAL;
+ struct nvme_effects_log *log;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ goto out;
+
+ log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0);
+ log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0);
+
+ log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0);
+ log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0);
+ log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0);
+ log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0);
+ log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0);
+
+ status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+
+ kfree(log);
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -146,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
if (!status)
status = nvmet_zero_sgl(req, len, req->data_len - len);
ctrl->nr_changed_ns = 0;
- clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked);
+ nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
mutex_unlock(&ctrl->lock);
out:
nvmet_req_complete(req, status);
}
+static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
+ struct nvme_ana_group_desc *desc)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_ns *ns;
+ u32 count = 0;
+
+ if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+ if (ns->anagrpid == grpid)
+ desc->nsids[count++] = cpu_to_le32(ns->nsid);
+ rcu_read_unlock();
+ }
+
+ desc->grpid = cpu_to_le32(grpid);
+ desc->nnsids = cpu_to_le32(count);
+ desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
+ desc->state = req->port->ana_state[grpid];
+ memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
+ return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
+}
+
+static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
+{
+ struct nvme_ana_rsp_hdr hdr = { 0, };
+ struct nvme_ana_group_desc *desc;
+ size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */
+ size_t len;
+ u32 grpid;
+ u16 ngrps = 0;
+ u16 status;
+
+ status = NVME_SC_INTERNAL;
+ desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
+ NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
+ if (!desc)
+ goto out;
+
+ down_read(&nvmet_ana_sem);
+ for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) {
+ if (!nvmet_ana_group_enabled[grpid])
+ continue;
+ len = nvmet_format_ana_group(req, grpid, desc);
+ status = nvmet_copy_to_sgl(req, offset, desc, len);
+ if (status)
+ break;
+ offset += len;
+ ngrps++;
+ }
+
+ hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
+ hdr.ngrps = cpu_to_le16(ngrps);
+ nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
+ up_read(&nvmet_ana_sem);
+
+ kfree(desc);
+
+ /* copy the header last once we know the number of groups */
+ status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr));
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -183,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
* the safest is to leave it as zeroes.
*/
- /* we support multiple ports and multiples hosts: */
- id->cmic = (1 << 0) | (1 << 1);
+ /* we support multiple ports, multiples hosts and ANA: */
+ id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
/* no limit on data transfer sizes for now */
id->mdts = 0;
@@ -208,7 +315,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
/* first slot is read-only, only one slot supported */
id->frmw = (1 << 0) | (1 << 1);
- id->lpa = (1 << 0) | (1 << 2);
+ id->lpa = (1 << 0) | (1 << 1) | (1 << 2);
id->elpe = NVMET_ERROR_LOG_SLOTS - 1;
id->npss = 0;
@@ -222,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
+ id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
NVME_CTRL_ONCS_WRITE_ZEROES);
@@ -238,19 +346,24 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
id->sgls |= cpu_to_le32(1 << 2);
- if (ctrl->ops->sqe_inline_size)
+ if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
strcpy(id->subnqn, ctrl->subsys->subsysnqn);
/* Max command capsule size is sqe + single page of in-capsule data */
id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
- ctrl->ops->sqe_inline_size) / 16);
+ req->port->inline_data_size) / 16);
/* Max response capsule size is cqe */
id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
id->msdbd = ctrl->ops->msdbd;
+ id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
+ id->anatt = 10; /* random value */
+ id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
+ id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS);
+
/*
* Meh, we don't really support any power state. Fake up the same
* values that qemu does.
@@ -259,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->psd[0].entry_lat = cpu_to_le32(0x10);
id->psd[0].exit_lat = cpu_to_le32(0x4);
+ id->nwpc = 1 << 0; /* write protect and no write protect */
+
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
@@ -292,8 +407,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
* nuse = ncap = nsze isn't always true, but we have no way to find
* that out from the underlying device.
*/
- id->ncap = id->nuse = id->nsze =
- cpu_to_le64(ns->size >> ns->blksize_shift);
+ id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift);
+ switch (req->port->ana_state[ns->anagrpid]) {
+ case NVME_ANA_INACCESSIBLE:
+ case NVME_ANA_PERSISTENT_LOSS:
+ break;
+ default:
+ id->nuse = id->nsze;
+ break;
+ }
/*
* We just provide a single LBA format that matches what the
@@ -307,11 +429,14 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
* controllers, but also with any other user of the block device.
*/
id->nmic = (1 << 0);
+ id->anagrpid = cpu_to_le32(ns->anagrpid);
- memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
+ memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
id->lbaf[0].ds = ns->blksize_shift;
+ if (ns->readonly)
+ id->nsattr |= (1 << 0);
nvmet_put_namespace(ns);
done:
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
@@ -424,6 +549,52 @@ static void nvmet_execute_abort(struct nvmet_req *req)
nvmet_req_complete(req, 0);
}
+static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
+{
+ u16 status;
+
+ if (req->ns->file)
+ status = nvmet_file_flush(req);
+ else
+ status = nvmet_bdev_flush(req);
+
+ if (status)
+ pr_err("write protect flush failed nsid: %u\n", req->ns->nsid);
+ return status;
+}
+
+static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
+{
+ u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]);
+ struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+ u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
+
+ req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
+ if (unlikely(!req->ns))
+ return status;
+
+ mutex_lock(&subsys->lock);
+ switch (write_protect) {
+ case NVME_NS_WRITE_PROTECT:
+ req->ns->readonly = true;
+ status = nvmet_write_protect_flush_sync(req);
+ if (status)
+ req->ns->readonly = false;
+ break;
+ case NVME_NS_NO_WRITE_PROTECT:
+ req->ns->readonly = false;
+ status = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (!status)
+ nvmet_ns_changed(subsys, req->ns->nsid);
+ mutex_unlock(&subsys->lock);
+ return status;
+}
+
static void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
@@ -454,6 +625,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
case NVME_FEAT_HOST_ID:
status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
break;
+ case NVME_FEAT_WRITE_PROTECT:
+ status = nvmet_set_feat_write_protect(req);
+ break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
@@ -462,6 +636,26 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
+static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
+{
+ struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+ u32 result;
+
+ req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
+ if (!req->ns)
+ return NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+ mutex_lock(&subsys->lock);
+ if (req->ns->readonly == true)
+ result = NVME_NS_WRITE_PROTECT;
+ else
+ result = NVME_NS_NO_WRITE_PROTECT;
+ nvmet_set_result(req, result);
+ mutex_unlock(&subsys->lock);
+
+ return 0;
+}
+
static void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
@@ -513,6 +707,9 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid,
sizeof(req->sq->ctrl->hostid));
break;
+ case NVME_FEAT_WRITE_PROTECT:
+ status = nvmet_get_feat_write_protect(req);
+ break;
default:
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
@@ -586,6 +783,12 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
case NVME_LOG_CHANGED_NS:
req->execute = nvmet_execute_get_log_changed_ns;
return 0;
+ case NVME_LOG_CMD_EFFECTS:
+ req->execute = nvmet_execute_get_log_cmd_effects_ns;
+ return 0;
+ case NVME_LOG_ANA:
+ req->execute = nvmet_execute_get_log_page_ana;
+ return 0;
}
break;
case nvme_admin_identify:
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ebea1373d1b7..b37a8e3e3f80 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_, addr_trsvcid);
+static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+
+ return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
+}
+
+static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+ int ret;
+
+ if (port->enabled) {
+ pr_err("Cannot modify inline_data_size while port enabled\n");
+ pr_err("Disable the port before modifying\n");
+ return -EACCES;
+ }
+ ret = kstrtoint(page, 0, &port->inline_data_size);
+ if (ret) {
+ pr_err("Invalid value '%s' for inline_data_size\n", page);
+ return -EINVAL;
+ }
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+
static ssize_t nvmet_addr_trtype_show(struct config_item *item,
char *page)
{
@@ -387,6 +416,39 @@ out_unlock:
CONFIGFS_ATTR(nvmet_ns_, device_nguid);
+static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid);
+}
+
+static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ns *ns = to_nvmet_ns(item);
+ u32 oldgrpid, newgrpid;
+ int ret;
+
+ ret = kstrtou32(page, 0, &newgrpid);
+ if (ret)
+ return ret;
+
+ if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS)
+ return -EINVAL;
+
+ down_write(&nvmet_ana_sem);
+ oldgrpid = ns->anagrpid;
+ nvmet_ana_group_enabled[newgrpid]++;
+ ns->anagrpid = newgrpid;
+ nvmet_ana_group_enabled[oldgrpid]--;
+ nvmet_ana_chgcnt++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_send_ana_event(ns->subsys, NULL);
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
+
static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
{
return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@ -412,11 +474,41 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_ns_, enable);
+static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io);
+}
+
+static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ns *ns = to_nvmet_ns(item);
+ bool val;
+
+ if (strtobool(page, &val))
+ return -EINVAL;
+
+ mutex_lock(&ns->subsys->lock);
+ if (ns->enabled) {
+ pr_err("disable ns before setting buffered_io value.\n");
+ mutex_unlock(&ns->subsys->lock);
+ return -EINVAL;
+ }
+
+ ns->buffered_io = val;
+ mutex_unlock(&ns->subsys->lock);
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, buffered_io);
+
static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_device_path,
&nvmet_ns_attr_device_nguid,
&nvmet_ns_attr_device_uuid,
+ &nvmet_ns_attr_ana_grpid,
&nvmet_ns_attr_enable,
+ &nvmet_ns_attr_buffered_io,
NULL,
};
@@ -863,6 +955,134 @@ static const struct config_item_type nvmet_referrals_type = {
.ct_group_ops = &nvmet_referral_group_ops,
};
+static struct {
+ enum nvme_ana_state state;
+ const char *name;
+} nvmet_ana_state_names[] = {
+ { NVME_ANA_OPTIMIZED, "optimized" },
+ { NVME_ANA_NONOPTIMIZED, "non-optimized" },
+ { NVME_ANA_INACCESSIBLE, "inaccessible" },
+ { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" },
+ { NVME_ANA_CHANGE, "change" },
+};
+
+static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+ enum nvme_ana_state state = grp->port->ana_state[grp->grpid];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+ if (state != nvmet_ana_state_names[i].state)
+ continue;
+ return sprintf(page, "%s\n", nvmet_ana_state_names[i].name);
+ }
+
+ return sprintf(page, "\n");
+}
+
+static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+ if (sysfs_streq(page, nvmet_ana_state_names[i].name))
+ goto found;
+ }
+
+ pr_err("Invalid value '%s' for ana_state\n", page);
+ return -EINVAL;
+
+found:
+ down_write(&nvmet_ana_sem);
+ grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state;
+ nvmet_ana_chgcnt++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+
+static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+ &nvmet_ana_group_attr_ana_state,
+ NULL,
+};
+
+static void nvmet_ana_group_release(struct config_item *item)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+
+ if (grp == &grp->port->ana_default_group)
+ return;
+
+ down_write(&nvmet_ana_sem);
+ grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+ nvmet_ana_group_enabled[grp->grpid]--;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+ kfree(grp);
+}
+
+static struct configfs_item_operations nvmet_ana_group_item_ops = {
+ .release = nvmet_ana_group_release,
+};
+
+static const struct config_item_type nvmet_ana_group_type = {
+ .ct_item_ops = &nvmet_ana_group_item_ops,
+ .ct_attrs = nvmet_ana_group_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *nvmet_ana_groups_make_group(
+ struct config_group *group, const char *name)
+{
+ struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+ struct nvmet_ana_group *grp;
+ u32 grpid;
+ int ret;
+
+ ret = kstrtou32(name, 0, &grpid);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+ goto out;
+
+ ret = -ENOMEM;
+ grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ if (!grp)
+ goto out;
+ grp->port = port;
+ grp->grpid = grpid;
+
+ down_write(&nvmet_ana_sem);
+ nvmet_ana_group_enabled[grpid]++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+
+ config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+ return &grp->group;
+out:
+ return ERR_PTR(ret);
+}
+
+static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+ .make_group = nvmet_ana_groups_make_group,
+};
+
+static const struct config_item_type nvmet_ana_groups_type = {
+ .ct_group_ops = &nvmet_ana_groups_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
/*
* Ports definitions.
*/
@@ -870,6 +1090,7 @@ static void nvmet_port_release(struct config_item *item)
{
struct nvmet_port *port = to_nvmet_port(item);
+ kfree(port->ana_state);
kfree(port);
}
@@ -879,6 +1100,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_traddr,
&nvmet_attr_addr_trsvcid,
&nvmet_attr_addr_trtype,
+ &nvmet_attr_param_inline_data_size,
NULL,
};
@@ -897,6 +1119,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
{
struct nvmet_port *port;
u16 portid;
+ u32 i;
if (kstrtou16(name, 0, &portid))
return ERR_PTR(-EINVAL);
@@ -905,9 +1128,24 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
if (!port)
return ERR_PTR(-ENOMEM);
+ port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+ sizeof(*port->ana_state), GFP_KERNEL);
+ if (!port->ana_state) {
+ kfree(port);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+ if (i == NVMET_DEFAULT_ANA_GRPID)
+ port->ana_state[1] = NVME_ANA_OPTIMIZED;
+ else
+ port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+ }
+
INIT_LIST_HEAD(&port->entry);
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
+ port->inline_data_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid);
config_group_init_type_name(&port->group, name, &nvmet_port_type);
@@ -920,6 +1158,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
"referrals", &nvmet_referrals_type);
configfs_add_default_group(&port->referrals_group, &port->group);
+ config_group_init_type_name(&port->ana_groups_group,
+ "ana_groups", &nvmet_ana_groups_type);
+ configfs_add_default_group(&port->ana_groups_group, &port->group);
+
+ port->ana_default_group.port = port;
+ port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+ config_group_init_type_name(&port->ana_default_group.group,
+ __stringify(NVMET_DEFAULT_ANA_GRPID),
+ &nvmet_ana_group_type);
+ configfs_add_default_group(&port->ana_default_group.group,
+ &port->ana_groups_group);
+
return &port->group;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 9838103f2d62..ebf3e7a6c49e 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -18,6 +18,7 @@
#include "nvmet.h"
+struct workqueue_struct *buffered_io_wq;
static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
static DEFINE_IDA(cntlid_ida);
@@ -39,6 +40,10 @@ static DEFINE_IDA(cntlid_ida);
*/
DECLARE_RWSEM(nvmet_config_sem);
+u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+u64 nvmet_ana_chgcnt;
+DECLARE_RWSEM(nvmet_ana_sem);
+
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn);
@@ -175,7 +180,7 @@ out_unlock:
mutex_unlock(&ctrl->lock);
}
-static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
{
struct nvmet_ctrl *ctrl;
@@ -189,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
}
}
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+ struct nvmet_port *port)
+{
+ struct nvmet_ctrl *ctrl;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (port && ctrl->port != port)
+ continue;
+ if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+ continue;
+ nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+ NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+ }
+ mutex_unlock(&subsys->lock);
+}
+
+void nvmet_port_send_ana_event(struct nvmet_port *port)
+{
+ struct nvmet_subsys_link *p;
+
+ down_read(&nvmet_config_sem);
+ list_for_each_entry(p, &port->subsystems, entry)
+ nvmet_send_ana_event(p->subsys, port);
+ up_read(&nvmet_config_sem);
+}
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
{
int ret = 0;
@@ -241,6 +273,10 @@ int nvmet_enable_port(struct nvmet_port *port)
return ret;
}
+ /* If the transport didn't set inline_data_size, then disable it. */
+ if (port->inline_data_size < 0)
+ port->inline_data_size = 0;
+
port->enabled = true;
return 0;
}
@@ -332,9 +368,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
int nvmet_ns_enable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
- int ret = 0;
+ int ret;
mutex_lock(&subsys->lock);
+ ret = -EMFILE;
+ if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+ goto out_unlock;
+ ret = 0;
if (ns->enabled)
goto out_unlock;
@@ -369,6 +409,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
+ subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
ns->enabled = true;
@@ -409,6 +450,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
percpu_ref_exit(&ns->ref);
mutex_lock(&subsys->lock);
+ subsys->nr_namespaces--;
nvmet_ns_changed(subsys, ns->nsid);
nvmet_ns_dev_disable(ns);
out_unlock:
@@ -419,6 +461,10 @@ void nvmet_ns_free(struct nvmet_ns *ns)
{
nvmet_ns_disable(ns);
+ down_write(&nvmet_ana_sem);
+ nvmet_ana_group_enabled[ns->anagrpid]--;
+ up_write(&nvmet_ana_sem);
+
kfree(ns->device_path);
kfree(ns);
}
@@ -436,7 +482,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
ns->nsid = nsid;
ns->subsys = subsys;
+
+ down_write(&nvmet_ana_sem);
+ ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+ nvmet_ana_group_enabled[ns->anagrpid]++;
+ up_write(&nvmet_ana_sem);
+
uuid_gen(&ns->uuid);
+ ns->buffered_io = false;
return ns;
}
@@ -542,6 +595,35 @@ int nvmet_sq_init(struct nvmet_sq *sq)
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);
+static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+ struct nvmet_ns *ns)
+{
+ enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+
+ if (unlikely(state == NVME_ANA_INACCESSIBLE))
+ return NVME_SC_ANA_INACCESSIBLE;
+ if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
+ return NVME_SC_ANA_PERSISTENT_LOSS;
+ if (unlikely(state == NVME_ANA_CHANGE))
+ return NVME_SC_ANA_TRANSITION;
+ return 0;
+}
+
+static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
+{
+ if (unlikely(req->ns->readonly)) {
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_flush:
+ break;
+ default:
+ return NVME_SC_NS_WRITE_PROTECTED;
+ }
+ }
+
+ return 0;
+}
+
static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -554,6 +636,12 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
+ ret = nvmet_check_ana_state(req->port, req->ns);
+ if (unlikely(ret))
+ return ret;
+ ret = nvmet_io_cmd_check_access(req);
+ if (unlikely(ret))
+ return ret;
if (req->ns->file)
return nvmet_file_parse_io_cmd(req);
@@ -870,6 +958,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
nvmet_init_cap(ctrl);
+ ctrl->port = req->port;
+
INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
INIT_LIST_HEAD(&ctrl->async_events);
@@ -1109,6 +1199,15 @@ static int __init nvmet_init(void)
{
int error;
+ nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+
+ buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+ WQ_MEM_RECLAIM, 0);
+ if (!buffered_io_wq) {
+ error = -ENOMEM;
+ goto out;
+ }
+
error = nvmet_init_discovery();
if (error)
goto out;
@@ -1129,6 +1228,7 @@ static void __exit nvmet_exit(void)
nvmet_exit_configfs();
nvmet_exit_discovery();
ida_destroy(&cntlid_ida);
+ destroy_workqueue(buffered_io_wq);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 08656b849bd6..eae29f493a07 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
id->sgls |= cpu_to_le32(1 << 2);
- if (ctrl->ops->sqe_inline_size)
+ if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
strcpy(id->subnqn, ctrl->subsys->subsysnqn);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index e0b0f7df70c2..7bc9f6240432 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -124,6 +124,13 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
submit_bio(bio);
}
+u16 nvmet_bdev_flush(struct nvmet_req *req)
+{
+ if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL))
+ return NVME_SC_INTERNAL | NVME_SC_DNR;
+ return 0;
+}
+
static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
struct nvme_dsm_range *range, struct bio **bio)
{
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 8c42b3a8c420..81a9dc5290a8 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -16,6 +16,8 @@
void nvmet_file_ns_disable(struct nvmet_ns *ns)
{
if (ns->file) {
+ if (ns->buffered_io)
+ flush_workqueue(buffered_io_wq);
mempool_destroy(ns->bvec_pool);
ns->bvec_pool = NULL;
kmem_cache_destroy(ns->bvec_cache);
@@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns)
int nvmet_file_ns_enable(struct nvmet_ns *ns)
{
- int ret;
+ int flags = O_RDWR | O_LARGEFILE;
struct kstat stat;
+ int ret;
+
+ if (!ns->buffered_io)
+ flags |= O_DIRECT;
- ns->file = filp_open(ns->device_path,
- O_RDWR | O_LARGEFILE | O_DIRECT, 0);
+ ns->file = filp_open(ns->device_path, flags, 0);
if (IS_ERR(ns->file)) {
pr_err("failed to open file %s: (%ld)\n",
ns->device_path, PTR_ERR(ns->file));
@@ -100,7 +105,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
iocb->ki_pos = pos;
iocb->ki_filp = req->ns->file;
- iocb->ki_flags = IOCB_DIRECT | ki_flags;
+ iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
ret = call_iter(iocb, &iter);
@@ -140,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
return;
}
+ pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+ if (unlikely(pos + req->data_len > req->ns->size)) {
+ nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+ return;
+ }
+
if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
GFP_KERNEL);
@@ -155,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
is_sync = true;
}
- pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
-
memset(&req->f.iocb, 0, sizeof(struct kiocb));
for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
@@ -189,14 +198,31 @@ out:
nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
}
-static void nvmet_file_flush_work(struct work_struct *w)
+static void nvmet_file_buffered_io_work(struct work_struct *w)
{
struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
- int ret;
- ret = vfs_fsync(req->ns->file, 1);
+ nvmet_file_execute_rw(req);
+}
- nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req)
+{
+ INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
+ queue_work(buffered_io_wq, &req->f.work);
+}
+
+u16 nvmet_file_flush(struct nvmet_req *req)
+{
+ if (vfs_fsync(req->ns->file, 1) < 0)
+ return NVME_SC_INTERNAL | NVME_SC_DNR;
+ return 0;
+}
+
+static void nvmet_file_flush_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+
+ nvmet_req_complete(req, nvmet_file_flush(req));
}
static void nvmet_file_execute_flush(struct nvmet_req *req)
@@ -209,22 +235,30 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
{
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
struct nvme_dsm_range range;
- loff_t offset;
- loff_t len;
- int i, ret;
+ loff_t offset, len;
+ u16 ret;
+ int i;
for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
- if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
- sizeof(range)))
+ ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+ sizeof(range));
+ if (ret)
break;
+
offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
- ret = vfs_fallocate(req->ns->file, mode, offset, len);
- if (ret)
+ if (offset + len > req->ns->size) {
+ ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
break;
+ }
+
+ if (vfs_fallocate(req->ns->file, mode, offset, len)) {
+ ret = NVME_SC_INTERNAL | NVME_SC_DNR;
+ break;
+ }
}
- nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+ nvmet_req_complete(req, ret);
}
static void nvmet_file_dsm_work(struct work_struct *w)
@@ -263,6 +297,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
req->ns->blksize_shift);
+ if (unlikely(offset + len > req->ns->size)) {
+ nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+ return;
+ }
+
ret = vfs_fallocate(req->ns->file, mode, offset, len);
nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
}
@@ -280,7 +319,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
switch (cmd->common.opcode) {
case nvme_cmd_read:
case nvme_cmd_write:
- req->execute = nvmet_file_execute_rw;
+ if (req->ns->buffered_io)
+ req->execute = nvmet_file_execute_rw_buffered_io;
+ else
+ req->execute = nvmet_file_execute_rw;
req->data_len = nvmet_rw_len(req);
return 0;
case nvme_cmd_flush:
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index ae7586b8be07..9908082b32c4 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set,
{
struct nvme_loop_ctrl *ctrl = set->driver_data;
+ nvme_req(req)->ctrl = &ctrl->ctrl;
return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
(set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 480dfe10fad9..ec9af4ee03b6 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,12 +30,11 @@
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
-
/*
* Supported optional AENs:
*/
#define NVMET_AEN_CFG_OPTIONAL \
- NVME_AEN_CFG_NS_ATTR
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
/*
* Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
@@ -59,12 +58,15 @@ struct nvmet_ns {
struct percpu_ref ref;
struct block_device *bdev;
struct file *file;
+ bool readonly;
u32 nsid;
u32 blksize_shift;
loff_t size;
u8 nguid[16];
uuid_t uuid;
+ u32 anagrpid;
+ bool buffered_io;
bool enabled;
struct nvmet_subsys *subsys;
const char *device_path;
@@ -97,6 +99,18 @@ struct nvmet_sq {
struct completion confirm_done;
};
+struct nvmet_ana_group {
+ struct config_group group;
+ struct nvmet_port *port;
+ u32 grpid;
+};
+
+static inline struct nvmet_ana_group *to_ana_group(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct nvmet_ana_group,
+ group);
+}
+
/**
* struct nvmet_port - Common structure to keep port
* information for the target.
@@ -114,8 +128,12 @@ struct nvmet_port {
struct list_head subsystems;
struct config_group referrals_group;
struct list_head referrals;
+ struct config_group ana_groups_group;
+ struct nvmet_ana_group ana_default_group;
+ enum nvme_ana_state *ana_state;
void *priv;
bool enabled;
+ int inline_data_size;
};
static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -124,6 +142,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
group);
}
+static inline struct nvmet_port *ana_groups_to_port(
+ struct config_item *item)
+{
+ return container_of(to_config_group(item), struct nvmet_port,
+ ana_groups_group);
+}
+
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_cq **cqs;
@@ -138,6 +163,8 @@ struct nvmet_ctrl {
u16 cntlid;
u32 kato;
+ struct nvmet_port *port;
+
u32 aen_enabled;
unsigned long aen_masked;
struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS];
@@ -166,6 +193,7 @@ struct nvmet_subsys {
struct kref ref;
struct list_head namespaces;
+ unsigned int nr_namespaces;
unsigned int max_nsid;
struct list_head ctrls;
@@ -225,7 +253,6 @@ struct nvmet_req;
struct nvmet_fabrics_ops {
struct module *owner;
unsigned int type;
- unsigned int sqe_inline_size;
unsigned int msdbd;
bool has_keyed_sgls : 1;
void (*queue_response)(struct nvmet_req *req);
@@ -269,6 +296,8 @@ struct nvmet_req {
const struct nvmet_fabrics_ops *ops;
};
+extern struct workqueue_struct *buffered_io_wq;
+
static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
{
req->rsp->status = cpu_to_le16(status << 1);
@@ -337,6 +366,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns);
struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid);
void nvmet_ns_free(struct nvmet_ns *ns);
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+ struct nvmet_port *port);
+void nvmet_port_send_ana_event(struct nvmet_port *port);
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops);
void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops);
@@ -357,6 +390,22 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
#define NVMET_QUEUE_SIZE 1024
#define NVMET_NR_QUEUES 128
#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
+
+/*
+ * Nice round number that makes a list of nsids fit into a page.
+ * Should become tunable at some point in the future.
+ */
+#define NVMET_MAX_NAMESPACES 1024
+
+/*
+ * 0 is not a valid ANA group ID, so we start numbering at 1.
+ *
+ * ANA Group 1 exists without manual intervention, has namespaces assigned to it
+ * by default, and is available in an optimized state through all ports.
+ */
+#define NVMET_MAX_ANAGRPS 128
+#define NVMET_DEFAULT_ANA_GRPID 1
+
#define NVMET_KAS 10
#define NVMET_DISC_KATO 120
@@ -370,6 +419,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys;
extern u64 nvmet_genctr;
extern struct rw_semaphore nvmet_config_sem;
+extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+extern u64 nvmet_ana_chgcnt;
+extern struct rw_semaphore nvmet_ana_sem;
+
bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
const char *hostnqn);
@@ -377,6 +430,9 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
int nvmet_file_ns_enable(struct nvmet_ns *ns);
void nvmet_bdev_ns_disable(struct nvmet_ns *ns);
void nvmet_file_ns_disable(struct nvmet_ns *ns);
+u16 nvmet_bdev_flush(struct nvmet_req *req);
+u16 nvmet_file_flush(struct nvmet_req *req);
+void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid);
static inline u32 nvmet_rw_len(struct nvmet_req *req)
{
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 52e0c5d579a7..e7f43d1e1779 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -33,16 +33,17 @@
#include "nvmet.h"
/*
- * We allow up to a page of inline data to go with the SQE
+ * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
*/
-#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE
+#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE
+#define NVMET_RDMA_MAX_INLINE_SGE 4
+#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE)
struct nvmet_rdma_cmd {
- struct ib_sge sge[2];
+ struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
struct ib_cqe cqe;
struct ib_recv_wr wr;
- struct scatterlist inline_sg;
- struct page *inline_page;
+ struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
struct nvme_command *nvme_cmd;
struct nvmet_rdma_queue *queue;
};
@@ -116,6 +117,8 @@ struct nvmet_rdma_device {
size_t srq_size;
struct kref ref;
struct list_head entry;
+ int inline_data_size;
+ int inline_page_count;
};
static bool nvmet_rdma_use_srq;
@@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static const struct nvmet_fabrics_ops nvmet_rdma_ops;
+static int num_pages(int len)
+{
+ return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
+}
+
/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
@@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}
+static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
+ struct nvmet_rdma_cmd *c)
+{
+ struct scatterlist *sg;
+ struct ib_sge *sge;
+ int i;
+
+ if (!ndev->inline_data_size)
+ return;
+
+ sg = c->inline_sg;
+ sge = &c->sge[1];
+
+ for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+ if (sge->length)
+ ib_dma_unmap_page(ndev->device, sge->addr,
+ sge->length, DMA_FROM_DEVICE);
+ if (sg_page(sg))
+ __free_page(sg_page(sg));
+ }
+}
+
+static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
+ struct nvmet_rdma_cmd *c)
+{
+ struct scatterlist *sg;
+ struct ib_sge *sge;
+ struct page *pg;
+ int len;
+ int i;
+
+ if (!ndev->inline_data_size)
+ return 0;
+
+ sg = c->inline_sg;
+ sg_init_table(sg, ndev->inline_page_count);
+ sge = &c->sge[1];
+ len = ndev->inline_data_size;
+
+ for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+ pg = alloc_page(GFP_KERNEL);
+ if (!pg)
+ goto out_err;
+ sg_assign_page(sg, pg);
+ sge->addr = ib_dma_map_page(ndev->device,
+ pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ndev->device, sge->addr))
+ goto out_err;
+ sge->length = min_t(int, len, PAGE_SIZE);
+ sge->lkey = ndev->pd->local_dma_lkey;
+ len -= sge->length;
+ }
+
+ return 0;
+out_err:
+ for (; i >= 0; i--, sg--, sge--) {
+ if (sge->length)
+ ib_dma_unmap_page(ndev->device, sge->addr,
+ sge->length, DMA_FROM_DEVICE);
+ if (sg_page(sg))
+ __free_page(sg_page(sg));
+ }
+ return -ENOMEM;
+}
+
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *c, bool admin)
{
@@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
c->sge[0].length = sizeof(*c->nvme_cmd);
c->sge[0].lkey = ndev->pd->local_dma_lkey;
- if (!admin) {
- c->inline_page = alloc_pages(GFP_KERNEL,
- get_order(NVMET_RDMA_INLINE_DATA_SIZE));
- if (!c->inline_page)
- goto out_unmap_cmd;
- c->sge[1].addr = ib_dma_map_page(ndev->device,
- c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
- goto out_free_inline_page;
- c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
- c->sge[1].lkey = ndev->pd->local_dma_lkey;
- }
+ if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
+ goto out_unmap_cmd;
c->cqe.done = nvmet_rdma_recv_done;
c->wr.wr_cqe = &c->cqe;
c->wr.sg_list = c->sge;
- c->wr.num_sge = admin ? 1 : 2;
+ c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
return 0;
-out_free_inline_page:
- if (!admin) {
- __free_pages(c->inline_page,
- get_order(NVMET_RDMA_INLINE_DATA_SIZE));
- }
out_unmap_cmd:
ib_dma_unmap_single(ndev->device, c->sge[0].addr,
sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
@@ -240,12 +297,8 @@ out:
static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *c, bool admin)
{
- if (!admin) {
- ib_dma_unmap_page(ndev->device, c->sge[1].addr,
- NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
- __free_pages(c->inline_page,
- get_order(NVMET_RDMA_INLINE_DATA_SIZE));
- }
+ if (!admin)
+ nvmet_rdma_free_inline_pages(ndev, c);
ib_dma_unmap_single(ndev->device, c->sge[0].addr,
sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
kfree(c->nvme_cmd);
@@ -383,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *cmd)
{
struct ib_recv_wr *bad_wr;
+ int ret;
ib_dma_sync_single_for_device(ndev->device,
cmd->sge[0].addr, cmd->sge[0].length,
DMA_FROM_DEVICE);
if (ndev->srq)
- return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
- return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+ ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
+ else
+ ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+
+ if (unlikely(ret))
+ pr_err("post_recv cmd failed\n");
+
+ return ret;
}
static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
@@ -429,7 +489,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
}
- if (rsp->req.sg != &rsp->cmd->inline_sg)
+ if (rsp->req.sg != rsp->cmd->inline_sg)
sgl_free(rsp->req.sg);
if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
@@ -493,7 +553,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
rsp->send_sge.addr, rsp->send_sge.length,
DMA_TO_DEVICE);
- if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
+ if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) {
pr_err("sending cmd response failed\n");
nvmet_rdma_release_rsp(rsp);
}
@@ -529,10 +589,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
u64 off)
{
- sg_init_table(&rsp->cmd->inline_sg, 1);
- sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
- rsp->req.sg = &rsp->cmd->inline_sg;
- rsp->req.sg_cnt = 1;
+ int sg_count = num_pages(len);
+ struct scatterlist *sg;
+ int i;
+
+ sg = rsp->cmd->inline_sg;
+ for (i = 0; i < sg_count; i++, sg++) {
+ if (i < sg_count - 1)
+ sg_unmark_end(sg);
+ else
+ sg_mark_end(sg);
+ sg->offset = off;
+ sg->length = min_t(int, len, PAGE_SIZE - off);
+ len -= sg->length;
+ if (!i)
+ off = 0;
+ }
+
+ rsp->req.sg = rsp->cmd->inline_sg;
+ rsp->req.sg_cnt = sg_count;
}
static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
@@ -544,7 +619,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
if (!nvme_is_write(rsp->req.cmd))
return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
+ if (off + len > rsp->queue->dev->inline_data_size) {
pr_err("invalid inline data offset!\n");
return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
}
@@ -743,7 +818,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
srq_size = 4095; /* XXX: tune */
srq_attr.attr.max_wr = srq_size;
- srq_attr.attr.max_sge = 2;
+ srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
srq_attr.attr.srq_limit = 0;
srq_attr.srq_type = IB_SRQT_BASIC;
srq = ib_create_srq(ndev->pd, &srq_attr);
@@ -765,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
ndev->srq = srq;
ndev->srq_size = srq_size;
- for (i = 0; i < srq_size; i++)
- nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+ for (i = 0; i < srq_size; i++) {
+ ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+ if (ret)
+ goto out_free_cmds;
+ }
return 0;
+out_free_cmds:
+ nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
ib_destroy_srq(srq);
return ret;
@@ -793,7 +873,10 @@ static void nvmet_rdma_free_dev(struct kref *ref)
static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
+ struct nvmet_port *port = cm_id->context;
struct nvmet_rdma_device *ndev;
+ int inline_page_count;
+ int inline_sge_count;
int ret;
mutex_lock(&device_list_mutex);
@@ -807,6 +890,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
if (!ndev)
goto out_err;
+ inline_page_count = num_pages(port->inline_data_size);
+ inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
+ cm_id->device->attrs.max_sge) - 1;
+ if (inline_page_count > inline_sge_count) {
+ pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
+ port->inline_data_size, cm_id->device->name,
+ inline_sge_count * PAGE_SIZE);
+ port->inline_data_size = inline_sge_count * PAGE_SIZE;
+ inline_page_count = inline_sge_count;
+ }
+ ndev->inline_data_size = port->inline_data_size;
+ ndev->inline_page_count = inline_page_count;
ndev->device = cm_id->device;
kref_init(&ndev->ref);
@@ -881,7 +976,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
} else {
/* +1 for drain */
qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
- qp_attr.cap.max_recv_sge = 2;
+ qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
}
ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
@@ -899,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
if (!ndev->srq) {
for (i = 0; i < queue->recv_queue_size; i++) {
queue->cmds[i].queue = queue;
- nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+ ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+ if (ret)
+ goto err_destroy_qp;
}
}
out:
return ret;
+err_destroy_qp:
+ rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
ib_free_cq(queue->cq);
goto out;
@@ -1379,6 +1478,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return -EINVAL;
}
+ if (port->inline_data_size < 0) {
+ port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+ } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+ pr_warn("inline_data_size %u is too large, reducing to %u\n",
+ port->inline_data_size,
+ NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+ port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+ }
+
ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
port->disc_addr.trsvcid, &addr);
if (ret) {
@@ -1456,7 +1564,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_RDMA,
- .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE,
.msdbd = 1,
.has_keyed_sgls = 1,
.add_port = nvmet_rdma_add_port,
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 80aca2456353..768953881c9e 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -21,6 +21,7 @@ CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS
obj-$(CONFIG_PCMCIA) += pcmcia/
obj-$(CONFIG_SCSI) += scsi_mod.o
+obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_common.o
obj-$(CONFIG_RAID_ATTRS) += raid_class.o
@@ -156,7 +157,6 @@ obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/
obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o
scsi_mod-y += scsi.o hosts.o scsi_ioctl.o \
scsicam.o scsi_error.o scsi_lib.o
-scsi_mod-y += scsi_common.o
scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o
scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o
scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c
index e489d89cbb45..379890c4500b 100644
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -339,7 +339,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
struct scsi_sense_hdr sshdr;
u8 *cmd_buf = NULL;
u8 *scsi_cmd = NULL;
- u8 *sense_buf = NULL;
int rc = 0;
int result = 0;
int retry_cnt = 0;
@@ -348,8 +347,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
retry:
cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
- sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
- if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+ if (unlikely(!cmd_buf || !scsi_cmd)) {
rc = -ENOMEM;
goto out;
}
@@ -364,7 +362,7 @@ retry:
/* Drop the ioctl read semahpore across lengthy call */
up_read(&cfg->ioctl_rwsem);
result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf,
- CMD_BUFSIZE, sense_buf, &sshdr, to, CMD_RETRIES,
+ CMD_BUFSIZE, NULL, &sshdr, to, CMD_RETRIES,
0, 0, NULL);
down_read(&cfg->ioctl_rwsem);
rc = check_state(cfg);
@@ -395,7 +393,6 @@ retry:
if (retry_cnt++ < 1) {
kfree(cmd_buf);
kfree(scsi_cmd);
- kfree(sense_buf);
goto retry;
}
}
@@ -426,7 +423,6 @@ retry:
out:
kfree(cmd_buf);
kfree(scsi_cmd);
- kfree(sense_buf);
dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n",
__func__, gli->max_lba, gli->blk_len, rc);
diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c
index 66e445a17d6c..2c904bf16b65 100644
--- a/drivers/scsi/cxlflash/vlun.c
+++ b/drivers/scsi/cxlflash/vlun.c
@@ -426,7 +426,6 @@ static int write_same16(struct scsi_device *sdev,
{
u8 *cmd_buf = NULL;
u8 *scsi_cmd = NULL;
- u8 *sense_buf = NULL;
int rc = 0;
int result = 0;
u64 offset = lba;
@@ -440,8 +439,7 @@ static int write_same16(struct scsi_device *sdev,
cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
- sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
- if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+ if (unlikely(!cmd_buf || !scsi_cmd)) {
rc = -ENOMEM;
goto out;
}
@@ -457,7 +455,7 @@ static int write_same16(struct scsi_device *sdev,
/* Drop the ioctl read semahpore across lengthy call */
up_read(&cfg->ioctl_rwsem);
result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf,
- CMD_BUFSIZE, sense_buf, NULL, to,
+ CMD_BUFSIZE, NULL, NULL, to,
CMD_RETRIES, 0, 0, NULL);
down_read(&cfg->ioctl_rwsem);
rc = check_state(cfg);
@@ -482,7 +480,6 @@ static int write_same16(struct scsi_device *sdev,
out:
kfree(cmd_buf);
kfree(scsi_cmd);
- kfree(sense_buf);
dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
return rc;
}
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index b8d131a455d0..dd738ae5c75b 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
mpi_request->CDB.EEDP32.PrimaryReferenceTag =
- cpu_to_be32(scsi_prot_ref_tag(scmd));
+ cpu_to_be32(t10_pi_ref_tag(scmd->request));
break;
case SCSI_PROT_DIF_TYPE3:
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 41e9ac9fc138..9cb9a166fa0c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -238,7 +238,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
/**
- * scsi_execute - insert request and wait for the result
+ * __scsi_execute - insert request and wait for the result
* @sdev: scsi device
* @cmd: scsi command
* @data_direction: data direction
@@ -255,7 +255,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
* Returns the scsi_cmnd result field if a command was executed, or a negative
* Linux error code if we didn't get that far.
*/
-int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
unsigned char *sense, struct scsi_sense_hdr *sshdr,
int timeout, int retries, u64 flags, req_flags_t rq_flags,
@@ -309,7 +309,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
return ret;
}
-EXPORT_SYMBOL(scsi_execute);
+EXPORT_SYMBOL(__scsi_execute);
/*
* Function: scsi_init_cmd_errh()
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9421d9877730..bbebdc3769b0 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
SCpnt->cmnd[0] = WRITE_6;
if (blk_integrity_rq(rq))
- sd_dif_prepare(SCpnt);
+ t10_pi_prepare(SCpnt->request, sdkp->protection_type);
} else if (rq_data_dir(rq) == READ) {
SCpnt->cmnd[0] = READ_6;
@@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt)
"sd_done: completed %d of %d bytes\n",
good_bytes, scsi_bufflen(SCpnt)));
- if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
- sd_dif_complete(SCpnt, good_bytes);
+ if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) &&
+ good_bytes)
+ t10_pi_complete(SCpnt->request, sdkp->protection_type,
+ good_bytes / scsi_prot_interval(SCpnt));
return good_bytes;
}
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 392c7d078ae3..a7d4f50b67d4 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op)
#ifdef CONFIG_BLK_DEV_INTEGRITY
extern void sd_dif_config_host(struct scsi_disk *);
-extern void sd_dif_prepare(struct scsi_cmnd *scmd);
-extern void sd_dif_complete(struct scsi_cmnd *, unsigned int);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline void sd_dif_config_host(struct scsi_disk *disk)
{
}
-static inline int sd_dif_prepare(struct scsi_cmnd *scmd)
-{
- return 0;
-}
-static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a)
-{
-}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 9035380c0dda..db72c82486e3 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -95,116 +95,3 @@ out:
blk_integrity_register(disk, &bi);
}
-/*
- * The virtual start sector is the one that was originally submitted
- * by the block layer. Due to partitioning, MD/DM cloning, etc. the
- * actual physical start sector is likely to be different. Remap
- * protection information to match the physical LBA.
- *
- * From a protocol perspective there's a slight difference between
- * Type 1 and 2. The latter uses 32-byte CDBs exclusively, and the
- * reference tag is seeded in the CDB. This gives us the potential to
- * avoid virt->phys remapping during write. However, at read time we
- * don't know whether the virt sector is the same as when we wrote it
- * (we could be reading from real disk as opposed to MD/DM device. So
- * we always remap Type 2 making it identical to Type 1.
- *
- * Type 3 does not have a reference tag so no remapping is required.
- */
-void sd_dif_prepare(struct scsi_cmnd *scmd)
-{
- const int tuple_sz = sizeof(struct t10_pi_tuple);
- struct bio *bio;
- struct scsi_disk *sdkp;
- struct t10_pi_tuple *pi;
- u32 phys, virt;
-
- sdkp = scsi_disk(scmd->request->rq_disk);
-
- if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
- return;
-
- phys = scsi_prot_ref_tag(scmd);
-
- __rq_for_each_bio(bio, scmd->request) {
- struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_vec iv;
- struct bvec_iter iter;
- unsigned int j;
-
- /* Already remapped? */
- if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
- break;
-
- virt = bip_get_seed(bip) & 0xffffffff;
-
- bip_for_each_vec(iv, bip, iter) {
- pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
- for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
- if (be32_to_cpu(pi->ref_tag) == virt)
- pi->ref_tag = cpu_to_be32(phys);
-
- virt++;
- phys++;
- }
-
- kunmap_atomic(pi);
- }
-
- bip->bip_flags |= BIP_MAPPED_INTEGRITY;
- }
-}
-
-/*
- * Remap physical sector values in the reference tag to the virtual
- * values expected by the block layer.
- */
-void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
-{
- const int tuple_sz = sizeof(struct t10_pi_tuple);
- struct scsi_disk *sdkp;
- struct bio *bio;
- struct t10_pi_tuple *pi;
- unsigned int j, intervals;
- u32 phys, virt;
-
- sdkp = scsi_disk(scmd->request->rq_disk);
-
- if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0)
- return;
-
- intervals = good_bytes / scsi_prot_interval(scmd);
- phys = scsi_prot_ref_tag(scmd);
-
- __rq_for_each_bio(bio, scmd->request) {
- struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_vec iv;
- struct bvec_iter iter;
-
- virt = bip_get_seed(bip) & 0xffffffff;
-
- bip_for_each_vec(iv, bip, iter) {
- pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
- for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
- if (intervals == 0) {
- kunmap_atomic(pi);
- return;
- }
-
- if (be32_to_cpu(pi->ref_tag) == phys)
- pi->ref_tag = cpu_to_be32(virt);
-
- virt++;
- phys++;
- intervals--;
- }
-
- kunmap_atomic(pi);
- }
- }
-}
-
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index 35fab1e18adc..ffcf902da390 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi,
int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
{
struct scsi_device *SDev;
- struct scsi_sense_hdr sshdr;
+ struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr;
int result, err = 0, retries = 0;
- unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL;
SDev = cd->device;
- if (cgc->sense)
- senseptr = sense_buffer;
+ if (cgc->sshdr)
+ sshdr = cgc->sshdr;
retry:
if (!scsi_block_when_processing_errors(SDev)) {
@@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
}
result = scsi_execute(SDev, cgc->cmd, cgc->data_direction,
- cgc->buffer, cgc->buflen, senseptr, &sshdr,
+ cgc->buffer, cgc->buflen, NULL, sshdr,
cgc->timeout, IOCTL_RETRIES, 0, 0, NULL);
- if (cgc->sense)
- memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense));
-
/* Minimal error checking. Ignore cases we know about, and report the rest. */
if (driver_byte(result) != 0) {
- switch (sshdr.sense_key) {
+ switch (sshdr->sense_key) {
case UNIT_ATTENTION:
SDev->changed = 1;
if (!cgc->quiet)
@@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
err = -ENOMEDIUM;
break;
case NOT_READY: /* This happens if there is no disc in drive */
- if (sshdr.asc == 0x04 &&
- sshdr.ascq == 0x01) {
+ if (sshdr->asc == 0x04 &&
+ sshdr->ascq == 0x01) {
/* sense: Logical unit is in process of becoming ready */
if (!cgc->quiet)
sr_printk(KERN_INFO, cd,
@@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
break;
case ILLEGAL_REQUEST:
err = -EIO;
- if (sshdr.asc == 0x20 &&
- sshdr.ascq == 0x00)
+ if (sshdr->asc == 0x20 &&
+ sshdr->ascq == 0x00)
/* sense: Invalid command operation code */
err = -EDRIVE_CANT_DO_THIS;
break;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 6dc8891ccb74..1c72db94270e 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -513,12 +513,12 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev,
if (sc->sc_data_direction == DMA_TO_DEVICE)
cmd_pi->pi_bytesout = cpu_to_virtio32(vdev,
- blk_rq_sectors(rq) *
- bi->tuple_size);
+ bio_integrity_bytes(bi,
+ blk_rq_sectors(rq)));
else if (sc->sc_data_direction == DMA_FROM_DEVICE)
cmd_pi->pi_bytesin = cpu_to_virtio32(vdev,
- blk_rq_sectors(rq) *
- bi->tuple_size);
+ bio_integrity_bytes(bi,
+ blk_rq_sectors(rq)));
}
#endif
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
index 4c44d7bed01a..cb6f32ce7de8 100644
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -1,10 +1,10 @@
menuconfig TARGET_CORE
tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure"
- depends on SCSI && BLOCK
+ depends on BLOCK
select CONFIGFS_FS
select CRC_T10DIF
- select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
+ select BLK_SCSI_REQUEST
select SGL_ALLOC
default n
help
@@ -29,6 +29,7 @@ config TCM_FILEIO
config TCM_PSCSI
tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI"
+ depends on SCSI
help
Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered
passthrough access to Linux/SCSI device
diff --git a/drivers/target/loopback/Kconfig b/drivers/target/loopback/Kconfig
index abe8ecbcdf06..158ee9d522f7 100644
--- a/drivers/target/loopback/Kconfig
+++ b/drivers/target/loopback/Kconfig
@@ -1,5 +1,6 @@
config LOOPBACK_TARGET
tristate "TCM Virtual SAS target and Linux/SCSI LDD fabric loopback module"
+ depends on SCSI
help
Say Y here to enable the TCM Virtual SAS target and Linux/SCSI LLD
fabric loopback module.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aba25414231a..38b8ce05cbc7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -666,7 +666,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_READ);
blk_queue_exit(bdev->bd_queue);
return result;
}
@@ -704,7 +705,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
return result;
set_page_writeback(page);
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_WRITE);
if (result) {
end_page_writeback(page);
} else {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1b8b44637e70..5331a15a61f1 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -873,8 +873,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct bio *bio;
if (per_dev != master_dev) {
- bio = bio_clone_kmalloc(master_dev->bio,
- GFP_KERNEL);
+ bio = bio_clone_fast(master_dev->bio,
+ GFP_KERNEL, NULL);
if (unlikely(!bio)) {
ORE_DBGMSG(
"Failed to allocate BIO size=%u\n",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7750bc5b85a..5863fd22e90b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3529,7 +3529,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+ part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
/* Cleanup superblock name */
strreplace(sb->s_id, '/', '!');
@@ -4838,7 +4838,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (sb->s_bdev->bd_part)
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ ((part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1));
else
es->s_kbytes_written =
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index e60cc5e89023..9212a026a1f1 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -58,7 +58,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
if (!sb->s_bdev->bd_part)
return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ (part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
@@ -70,7 +71,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ ((part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4d8b1de83143..6799c3fc44e3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
#define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \
(s)->sectors_written_start) >> 1)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3995e926ba3a..17bcff789c08 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2882,7 +2882,8 @@ try_onemore:
/* For write statistics */
if (sb->s_bdev->bd_part)
sbi->sectors_written_start =
- (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+ (u64)part_stat_read(sb->s_bdev->bd_part,
+ sectors[STAT_WRITE]);
/* Read accumulated write IO statistics if exists */
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
diff --git a/fs/mpage.c b/fs/mpage.c
index b7e7f570733a..b73638db9866 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- page_endio(page, op_is_write(bio_op(bio)),
- blk_status_to_errno(bio->bi_status));
+ page_endio(page, bio_op(bio),
+ blk_status_to_errno(bio->bi_status));
}
bio_put(bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f08f5fe7bd08..51371740d2a8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -429,7 +429,6 @@ extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
extern struct bio_set fs_bio_set;
@@ -443,12 +442,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
}
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
- return bio_clone_bioset(bio, gfp_mask, NULL);
-
-}
-
extern blk_qc_t submit_bio(struct bio *);
extern void bio_endio(struct bio *);
@@ -496,9 +489,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int op,
struct hd_struct *part,
unsigned long start_time);
@@ -553,8 +546,16 @@ do { \
#define bio_dev(bio) \
disk_devt((bio)->bi_disk)
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+#else
+static inline int bio_associate_blkcg_from_page(struct bio *bio,
+ struct page *page) { return 0; }
+#endif
+
#ifdef CONFIG_BLK_CGROUP
int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
void bio_disassociate_task(struct bio *bio);
void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
#else /* CONFIG_BLK_CGROUP */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6c666fd7de3c..34aec30e06c7 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -35,6 +35,7 @@ enum blkg_rwstat_type {
BLKG_RWSTAT_WRITE,
BLKG_RWSTAT_SYNC,
BLKG_RWSTAT_ASYNC,
+ BLKG_RWSTAT_DISCARD,
BLKG_RWSTAT_NR,
BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
@@ -136,6 +137,12 @@ struct blkcg_gq {
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
struct rcu_head rcu_head;
+
+ atomic_t use_delay;
+ atomic64_t delay_nsec;
+ atomic64_t delay_start;
+ u64 last_delay;
+ int last_use;
};
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -148,6 +155,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
+ size_t size);
struct blkcg_policy {
int plid;
@@ -167,6 +176,7 @@ struct blkcg_policy {
blkcg_pol_offline_pd_fn *pd_offline_fn;
blkcg_pol_free_pd_fn *pd_free_fn;
blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
+ blkcg_pol_stat_pd_fn *pd_stat_fn;
};
extern struct blkcg blkcg_root;
@@ -238,6 +248,42 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
return css_to_blkcg(task_css(current, io_cgrp_id));
}
+static inline bool blk_cgroup_congested(void)
+{
+ struct cgroup_subsys_state *css;
+ bool ret = false;
+
+ rcu_read_lock();
+ css = kthread_blkcg();
+ if (!css)
+ css = task_css(current, io_cgrp_id);
+ while (css) {
+ if (atomic_read(&css->cgroup->congestion_count)) {
+ ret = true;
+ break;
+ }
+ css = css->parent;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg. The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio. Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+ return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
/**
* blkcg_parent - get the parent of a blkcg
* @blkcg: blkcg of interest
@@ -296,6 +342,17 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
}
/**
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{
+ return q->root_blkg;
+}
+
+/**
* blkg_to_pdata - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
@@ -355,6 +412,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
atomic_inc(&blkg->refcnt);
}
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+ if (atomic_inc_not_zero(&blkg->refcnt))
+ return blkg;
+ return NULL;
+}
+
+
void __blkg_release_rcu(struct rcu_head *rcu);
/**
@@ -589,7 +661,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
{
struct percpu_counter *cnt;
- if (op_is_write(op))
+ if (op_is_discard(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+ else if (op_is_write(op))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
@@ -706,8 +780,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
if (!throtl) {
blkg = blkg ?: q->root_blkg;
- blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
- bio->bi_iter.bi_size);
+ /*
+ * If the bio is flagged with BIO_QUEUE_ENTERED it means this
+ * is a split bio and we would have already accounted for the
+ * size of the bio.
+ */
+ if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
+ blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
+ bio->bi_iter.bi_size);
blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
}
@@ -715,6 +795,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
return !throtl;
}
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+ if (atomic_add_return(1, &blkg->use_delay) == 1)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ if (old == 0)
+ return 0;
+
+ /*
+ * We do this song and dance because we can race with somebody else
+ * adding or removing delay. If we just did an atomic_dec we'd end up
+ * negative and we'd already be in trouble. We need to subtract 1 and
+ * then check to see if we were the last delay so we can drop the
+ * congestion count on the cgroup.
+ */
+ while (old) {
+ int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+ if (cur == old)
+ break;
+ old = cur;
+ }
+
+ if (old == 0)
+ return 0;
+ if (old == 1)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+ if (!old)
+ return;
+ /* We only want 1 person clearing the congestion count for this blkg. */
+ while (old) {
+ int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+ if (cur == old) {
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ break;
+ }
+ old = cur;
+ }
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
#else /* CONFIG_BLK_CGROUP */
struct blkcg {
@@ -734,9 +867,16 @@ struct blkcg_policy {
#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
#ifdef CONFIG_BLOCK
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
static inline void blkcg_exit_queue(struct request_queue *q) { }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ca3f2c2edd85..1da59c16f637 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -35,10 +35,12 @@ struct blk_mq_hw_ctx {
struct sbitmap ctx_map;
struct blk_mq_ctx *dispatch_from;
+ unsigned int dispatch_busy;
- struct blk_mq_ctx **ctxs;
unsigned int nr_ctx;
+ struct blk_mq_ctx **ctxs;
+ spinlock_t dispatch_wait_lock;
wait_queue_entry_t dispatch_wait;
atomic_t wait_index;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3c4f390aea4b..f6dfb30737d8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -179,11 +179,9 @@ struct bio {
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- void *bi_cg_private;
+ struct blkcg_gq *bi_blkg;
struct bio_issue bi_issue;
#endif
-#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
@@ -329,7 +327,7 @@ enum req_flag_bits {
/* for driver use */
__REQ_DRV,
-
+ __REQ_SWAP, /* swapping request. */
__REQ_NR_BITS, /* stops here */
};
@@ -351,6 +349,7 @@ enum req_flag_bits {
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_DRV (1ULL << __REQ_DRV)
+#define REQ_SWAP (1ULL << __REQ_SWAP)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -358,6 +357,14 @@ enum req_flag_bits {
#define REQ_NOMERGE_FLAGS \
(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
+enum stat_group {
+ STAT_READ,
+ STAT_WRITE,
+ STAT_DISCARD,
+
+ NR_STAT_GROUPS
+};
+
#define bio_op(bio) \
((bio)->bi_opf & REQ_OP_MASK)
#define req_op(req) \
@@ -395,6 +402,18 @@ static inline bool op_is_sync(unsigned int op)
(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
}
+static inline bool op_is_discard(unsigned int op)
+{
+ return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
+}
+
+static inline int op_stat_group(unsigned int op)
+{
+ if (op_is_discard(op))
+ return STAT_DISCARD;
+ return op_is_write(op);
+}
+
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
#define BLK_QC_T_SHIFT 16
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 79226ca8f80f..d6869e0e2b64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,8 +27,6 @@
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
-#include <linux/seqlock.h>
-#include <linux/u64_stats_sync.h>
struct module;
struct scsi_ioctl_command;
@@ -42,7 +40,7 @@ struct bsg_job;
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
-struct rq_wb;
+struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
@@ -442,10 +440,8 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
- atomic_t shared_hctx_restart;
-
struct blk_queue_stats *stats;
- struct rq_wb *rq_wb;
+ struct rq_qos *rq_qos;
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
@@ -592,6 +588,7 @@ struct request_queue {
struct queue_limits limits;
+#ifdef CONFIG_BLK_DEV_ZONED
/*
* Zoned block device information for request dispatch control.
* nr_zones is the total number of zones of the device. This is always
@@ -612,6 +609,7 @@ struct request_queue {
unsigned int nr_zones;
unsigned long *seq_zones_bitmap;
unsigned long *seq_zones_wlock;
+#endif /* CONFIG_BLK_DEV_ZONED */
/*
* sg stuff
@@ -800,11 +798,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
}
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
-{
- return q->nr_zones;
-}
-
+#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_queue_zone_no(struct request_queue *q,
sector_t sector)
{
@@ -820,6 +814,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
return false;
return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
}
+#endif /* CONFIG_BLK_DEV_ZONED */
static inline bool rq_is_sync(struct request *rq)
{
@@ -1070,6 +1065,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}
+#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
@@ -1079,6 +1075,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
}
+#endif /* CONFIG_BLK_DEV_ZONED */
/*
* Some commands like WRITE SAME have a payload or data transfer size which
@@ -1437,8 +1434,6 @@ enum blk_default_limits {
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
};
-#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
-
static inline unsigned long queue_segment_boundary(struct request_queue *q)
{
return q->limits.seg_boundary_mask;
@@ -1639,15 +1634,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
return 0;
}
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
-
- if (q)
- return blk_queue_nr_zones(q);
- return 0;
-}
-
static inline int queue_dma_alignment(struct request_queue *q)
{
return q ? q->dma_alignment : 511;
@@ -1877,6 +1863,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
bip_next->bip_vec[0].bv_offset);
}
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi: blk_integrity profile for device
+ * @sectors: Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device. Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
#else /* CONFIG_BLK_DEV_INTEGRITY */
struct bio;
@@ -1950,12 +1958,24 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
return false;
}
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ return 0;
+}
+
#endif /* CONFIG_BLK_DEV_INTEGRITY */
struct block_device_operations {
int (*open) (struct block_device *, fmode_t);
void (*release) (struct gendisk *, fmode_t);
- int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
+ int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
unsigned int (*check_events) (struct gendisk *disk,
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index e75dfd1f1dec..528271c60018 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -13,6 +13,7 @@
#include <linux/fs.h> /* not really needed, later.. */
#include <linux/list.h>
+#include <scsi/scsi_common.h>
#include <uapi/linux/cdrom.h>
struct packet_command
@@ -21,7 +22,7 @@ struct packet_command
unsigned char *buffer;
unsigned int buflen;
int stat;
- struct request_sense *sense;
+ struct scsi_sense_hdr *sshdr;
unsigned char data_direction;
int quiet;
int timeout;
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..ff20b677fb9f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -438,6 +438,9 @@ struct cgroup {
/* used to store eBPF programs */
struct cgroup_bpf bpf;
+ /* If there is block congestion on this cgroup. */
+ atomic_t congestion_count;
+
/* ids of the ancestors at each level including self */
int ancestor_ids[];
};
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6cb8a5789668..57864422a2c8 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/uuid.h>
+#include <linux/blk_types.h>
#ifdef CONFIG_BLOCK
@@ -82,10 +83,10 @@ struct partition {
} __attribute__((packed));
struct disk_stats {
- unsigned long sectors[2]; /* READs and WRITEs */
- unsigned long ios[2];
- unsigned long merges[2];
- unsigned long ticks[2];
+ unsigned long sectors[NR_STAT_GROUPS];
+ unsigned long ios[NR_STAT_GROUPS];
+ unsigned long merges[NR_STAT_GROUPS];
+ unsigned long ticks[NR_STAT_GROUPS];
unsigned long io_ticks;
unsigned long time_in_queue;
};
@@ -353,6 +354,11 @@ static inline void free_part_stats(struct hd_struct *part)
#endif /* CONFIG_SMP */
+#define part_stat_read_accum(part, field) \
+ (part_stat_read(part, field[STAT_READ]) + \
+ part_stat_read(part, field[STAT_WRITE]) + \
+ part_stat_read(part, field[STAT_DISCARD]))
+
#define part_stat_add(cpu, part, field, addnd) do { \
__part_stat_add((cpu), (part), field, addnd); \
if ((part)->partno) \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..680d3395fc83 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
bool compound);
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound);
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
bool lrucare, bool compound);
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
@@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
return 0;
}
+static inline int mem_cgroup_try_charge_delay(struct page *page,
+ struct mm_struct *mm,
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcgp,
+ bool compound)
+{
+ *memcgp = NULL;
+ return 0;
+}
+
static inline void mem_cgroup_commit_charge(struct page *page,
struct mem_cgroup *memcg,
bool lrucare, bool compound)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2950ce957656..68e91ef5494c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,7 +242,12 @@ struct nvme_id_ctrl {
__le32 sanicap;
__le32 hmminds;
__le16 hmmaxd;
- __u8 rsvd338[174];
+ __u8 rsvd338[4];
+ __u8 anatt;
+ __u8 anacap;
+ __le32 anagrpmax;
+ __le32 nanagrpid;
+ __u8 rsvd352[160];
__u8 sqes;
__u8 cqes;
__le16 maxcmd;
@@ -254,11 +259,12 @@ struct nvme_id_ctrl {
__le16 awun;
__le16 awupf;
__u8 nvscc;
- __u8 rsvd531;
+ __u8 nwpc;
__le16 acwu;
__u8 rsvd534[2];
__le32 sgls;
- __u8 rsvd540[228];
+ __le32 mnan;
+ __u8 rsvd544[224];
char subnqn[256];
__u8 rsvd1024[768];
__le32 ioccsz;
@@ -312,7 +318,11 @@ struct nvme_id_ns {
__le16 nabspf;
__le16 noiob;
__u8 nvmcap[16];
- __u8 rsvd64[40];
+ __u8 rsvd64[28];
+ __le32 anagrpid;
+ __u8 rsvd96[3];
+ __u8 nsattr;
+ __u8 rsvd100[4];
__u8 nguid[16];
__u8 eui64[8];
struct nvme_lbaf lbaf[16];
@@ -425,6 +435,32 @@ struct nvme_effects_log {
__u8 resv[2048];
};
+enum nvme_ana_state {
+ NVME_ANA_OPTIMIZED = 0x01,
+ NVME_ANA_NONOPTIMIZED = 0x02,
+ NVME_ANA_INACCESSIBLE = 0x03,
+ NVME_ANA_PERSISTENT_LOSS = 0x04,
+ NVME_ANA_CHANGE = 0x0f,
+};
+
+struct nvme_ana_group_desc {
+ __le32 grpid;
+ __le32 nnsids;
+ __le64 chgcnt;
+ __u8 state;
+ __u8 rsvd17[15];
+ __le32 nsids[];
+};
+
+/* flag for the log specific field of the ANA log */
+#define NVME_ANA_LOG_RGO (1 << 0)
+
+struct nvme_ana_rsp_hdr {
+ __le64 chgcnt;
+ __le16 ngrps;
+ __le16 rsvd10[3];
+};
+
enum {
NVME_SMART_CRIT_SPARE = 1 << 0,
NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
@@ -444,11 +480,13 @@ enum {
enum {
NVME_AER_NOTICE_NS_CHANGED = 0x00,
NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+ NVME_AER_NOTICE_ANA = 0x03,
};
enum {
NVME_AEN_CFG_NS_ATTR = 1 << 8,
NVME_AEN_CFG_FW_ACT = 1 << 9,
+ NVME_AEN_CFG_ANA_CHANGE = 1 << 11,
};
struct nvme_lba_range_type {
@@ -749,15 +787,22 @@ enum {
NVME_FEAT_HOST_MEM_BUF = 0x0d,
NVME_FEAT_TIMESTAMP = 0x0e,
NVME_FEAT_KATO = 0x0f,
+ NVME_FEAT_HCTM = 0x10,
+ NVME_FEAT_NOPSC = 0x11,
+ NVME_FEAT_RRL = 0x12,
+ NVME_FEAT_PLM_CONFIG = 0x13,
+ NVME_FEAT_PLM_WINDOW = 0x14,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
NVME_FEAT_RESV_PERSIST = 0x83,
+ NVME_FEAT_WRITE_PROTECT = 0x84,
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
NVME_LOG_CHANGED_NS = 0x04,
NVME_LOG_CMD_EFFECTS = 0x05,
+ NVME_LOG_ANA = 0x0c,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -765,6 +810,14 @@ enum {
NVME_FWACT_ACTV = (2 << 3),
};
+/* NVMe Namespace Write Protect State */
+enum {
+ NVME_NS_NO_WRITE_PROTECT = 0,
+ NVME_NS_WRITE_PROTECT,
+ NVME_NS_WRITE_PROTECT_POWER_CYCLE,
+ NVME_NS_WRITE_PROTECT_PERMANENT,
+};
+
#define NVME_MAX_CHANGED_NAMESPACES 1024
struct nvme_identify {
@@ -880,7 +933,7 @@ struct nvme_get_log_page_command {
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__u8 lid;
- __u8 rsvd10;
+ __u8 lsp; /* upper 4 bits reserved */
__le16 numdl;
__le16 numdu;
__u16 rsvd11;
@@ -1111,6 +1164,8 @@ enum {
NVME_SC_SGL_INVALID_OFFSET = 0x16,
NVME_SC_SGL_INVALID_SUBTYPE = 0x17,
+ NVME_SC_NS_WRITE_PROTECTED = 0x20,
+
NVME_SC_LBA_RANGE = 0x80,
NVME_SC_CAP_EXCEEDED = 0x81,
NVME_SC_NS_NOT_READY = 0x82,
@@ -1180,6 +1235,13 @@ enum {
NVME_SC_ACCESS_DENIED = 0x286,
NVME_SC_UNWRITTEN_BLOCK = 0x287,
+ /*
+ * Path-related Errors:
+ */
+ NVME_SC_ANA_PERSISTENT_LOSS = 0x301,
+ NVME_SC_ANA_INACCESSIBLE = 0x302,
+ NVME_SC_ANA_TRANSITION = 0x303,
+
NVME_SC_DNR = 0x4000,
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dac5086e3815..95a5018c338e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,6 +734,10 @@ struct task_struct {
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif
+#ifdef CONFIG_BLK_CGROUP
+ /* to be used once the psi infrastructure lands upstream. */
+ unsigned use_memdelay:1;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
@@ -1150,6 +1154,10 @@ struct task_struct {
unsigned int memcg_nr_pages_over_high;
#endif
+#ifdef CONFIG_BLK_CGROUP
+ struct request_queue *throttle_queue;
+#endif
+
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c063443d8638..1a8bd05a335e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return memcg->swappiness;
}
-
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
}
#endif
+#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+ gfp_t gfp_mask);
+#else
+static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
+ int node, gfp_t gfp_mask)
+{
+}
+#endif
+
#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index c6aa8a3c42ed..b9626aa7e90c 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -37,9 +37,33 @@ struct t10_pi_tuple {
#define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
#define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
+static inline u32 t10_pi_ref_tag(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ return blk_rq_pos(rq) >>
+ (rq->q->integrity.interval_exp - 9) & 0xffffffff;
+#else
+ return -1U;
+#endif
+}
+
extern const struct blk_integrity_profile t10_pi_type1_crc;
extern const struct blk_integrity_profile t10_pi_type1_ip;
extern const struct blk_integrity_profile t10_pi_type3_crc;
extern const struct blk_integrity_profile t10_pi_type3_ip;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+extern void t10_pi_prepare(struct request *rq, u8 protection_type);
+extern void t10_pi_complete(struct request *rq, u8 protection_type,
+ unsigned int intervals);
+#else
+static inline void t10_pi_complete(struct request *rq, u8 protection_type,
+ unsigned int intervals)
+{
+}
+static inline void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+}
+#endif
+
#endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 4a8841963c2e..05589a3e37f4 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,6 +51,7 @@
#include <linux/security.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
struct linux_binprm;
/*
@@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
task_work_run();
mem_cgroup_handle_over_high();
+ blkcg_maybe_throttle_current();
}
#endif /* <linux/tracehook.h> */
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index aaf1e971c6a3..c891ada3c5c2 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -4,6 +4,7 @@
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
+#include <linux/t10-pi.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/timer.h>
@@ -14,8 +15,6 @@
struct Scsi_Host;
struct scsi_driver;
-#include <scsi/scsi_device.h>
-
/*
* MAX_COMMAND_SIZE is:
* The longest fixed-length SCSI CDB as per the SCSI standard.
@@ -120,11 +119,11 @@ struct scsi_cmnd {
struct request *request; /* The command we are
working on */
-#define SCSI_SENSE_BUFFERSIZE 96
unsigned char *sense_buffer;
/* obtained by REQUEST SENSE when
* CHECK CONDITION is received on original
- * command (auto-sense) */
+ * command (auto-sense). Length must be
+ * SCSI_SENSE_BUFFERSIZE bytes. */
/* Low-level done function - can be used by low-level driver to point
* to completion function. Not used by mid/upper level code. */
@@ -313,12 +312,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
return scmd->device->sector_size;
}
-static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
-{
- return blk_rq_pos(scmd->request) >>
- (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
-}
-
static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
{
return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 4c36af6edd79..202f4d6a4342 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -17,6 +17,8 @@ struct scsi_sense_hdr;
typedef __u64 __bitwise blist_flags_t;
+#define SCSI_SENSE_BUFFERSIZE 96
+
struct scsi_mode_data {
__u32 length;
__u16 block_descriptor_length;
@@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state);
extern int scsi_is_sdev_device(const struct device *);
extern int scsi_is_target_device(const struct device *);
extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
-extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
unsigned char *sense, struct scsi_sense_hdr *sshdr,
int timeout, int retries, u64 flags,
req_flags_t rq_flags, int *resid);
+/* Make sure any sense buffer is the correct size. */
+#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \
+ sshdr, timeout, retries, flags, rq_flags, resid) \
+({ \
+ BUILD_BUG_ON((sense) != NULL && \
+ sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \
+ __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \
+ sense, sshdr, timeout, retries, flags, rq_flags, \
+ resid); \
+})
static inline int scsi_execute_req(struct scsi_device *sdev,
const unsigned char *cmd, int data_direction, void *buffer,
unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 821f71a2e48f..8d19e02d752a 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -195,7 +195,7 @@ struct cache_sb {
};
};
- __u32 last_mount; /* time_t */
+ __u32 last_mount; /* time overflow in y2106 */
__u16 first_bucket;
union {
@@ -318,7 +318,7 @@ struct uuid_entry {
struct {
__u8 uuid[16];
__u8 label[32];
- __u32 first_reg;
+ __u32 first_reg; /* time overflow in y2106 */
__u32 last_reg;
__u32 invalidated;
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index e3c70fe6bf0f..ff5a5db8906a 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -117,7 +117,7 @@ struct blk_zone_report {
__u32 nr_zones;
__u8 reserved[4];
struct blk_zone zones[0];
-} __packed;
+};
/**
* struct blk_zone_range - BLKRESETZONE ioctl request
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d8d0e016fc6..33112315b5c0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -866,6 +866,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->fail_nth = 0;
#endif
+#ifdef CONFIG_BLK_CGROUP
+ tsk->throttle_queue = NULL;
+ tsk->use_memdelay = 0;
+#endif
+
return tsk;
free_stack:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9ae283..b951aa1fac61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
+ if (!blk_debugfs_root)
+ return -ENOENT;
+
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
@@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -ENOENT;
- if (!blk_debugfs_root)
- goto err;
-
dir = debugfs_lookup(buts->name, blk_debugfs_root);
if (!dir)
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25346bd99364..a9e1e093df51 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
@@ -1312,7 +1312,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2173f7e5164..b836e7f00309 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5600,6 +5600,19 @@ out:
return ret;
}
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+ memcg = *memcgp;
+ mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+ return ret;
+}
+
/**
* mem_cgroup_commit_charge - commit a page charge
* @page: page to charge
diff --git a/mm/memory.c b/mm/memory.c
index 6d175057cfd0..348279ff6e51 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2524,7 +2524,7 @@ static int wp_page_copy(struct vm_fault *vmf)
cow_user_page(new_page, old_page, vmf->address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -3024,8 +3024,8 @@ int do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -3186,7 +3186,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
goto oom_free_page;
/*
@@ -3682,7 +3683,7 @@ static int do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..aafd19ec1db4 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio_associate_blkcg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/readahead.c b/mm/readahead.c
index e273f0de3376..a59ea70527b9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"
@@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
+ unsigned long add_pages;
pgoff_t prev_offset;
/*
@@ -474,10 +476,17 @@ readit:
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
+ * Take care of maximum IO pages as above.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max_pages);
- ra->size += ra->async_size;
+ add_pages = get_next_ra_size(ra, max_pages);
+ if (ra->size + add_pages <= max_pages) {
+ ra->async_size = add_pages;
+ ra->size += add_pages;
+ } else {
+ ra->size = max_pages;
+ ra->async_size = max_pages >> 1;
+ }
}
return ra_submit(ra, mapping, filp);
@@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping,
if (!ra->ra_pages)
return;
+ if (blk_cgroup_congested())
+ return;
+
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
force_page_cache_readahead(mapping, filp, offset, req_size);
@@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping,
if (inode_read_congested(mapping->host))
return;
+ if (blk_cgroup_congested())
+ return;
+
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 96bcc51fb9ec..06ebe17bb924 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
- false);
+ error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+ &memcg, false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1713,7 +1713,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1819,7 +1819,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
if (error)
goto unacct;
@@ -2292,7 +2292,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
__SetPageSwapBacked(page);
__SetPageUptodate(page);
- ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
if (ret)
goto out_release;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 18185ae4f223..8837b22c848d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3745,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+ gfp_t gfp_mask)
+{
+ struct swap_info_struct *si, *next;
+ if (!(gfp_mask & __GFP_IO) || !memcg)
+ return;
+
+ if (!blk_cgroup_congested())
+ return;
+
+ /*
+ * We've already scheduled a throttle, avoid taking the global swap
+ * lock.
+ */
+ if (current->throttle_queue)
+ return;
+
+ spin_lock(&swap_avail_lock);
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+ avail_lists[node]) {
+ if (si->bdev) {
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+ true);
+ break;
+ }
+ }
+ spin_unlock(&swap_avail_lock);
+}
+#endif
+
static int __init swapfile_init(void)
{
int nid;