From 5014c311baa2b21384321fa4a9f617a92e3e56f0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 2 Sep 2015 16:46:02 -0600
Subject: block: fix bogus compiler warnings in blk-merge.c

The compiler can't figure out that bvprv is initialized whenever 'prev'
is set to 1 as well. Use a pointer to bvprv instead, setting it to NULL
initially, and get rid of the 'prev' tracking. This dumbs it down
enough that gcc is happy.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d088cffb8105..cce23ba1ae5f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -67,10 +67,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio_set *bs)
 {
 	struct bio *split;
-	struct bio_vec bv, bvprv;
+	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned seg_size = 0, nsegs = 0, sectors = 0;
-	int prev = 0;
 
 	bio_for_each_segment(bv, bio, iter) {
 		sectors += bv.bv_len >> 9;
@@ -82,20 +81,20 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (prev && bvec_gap_to_prev(q, &bvprv, bv.bv_offset))
+		if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
 			goto split;
 
-		if (prev && blk_queue_cluster(q)) {
+		if (bvprvp && blk_queue_cluster(q)) {
 			if (seg_size + bv.bv_len > queue_max_segment_size(q))
 				goto new_segment;
-			if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
+			if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
 				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+			if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
 				goto new_segment;
 
 			seg_size += bv.bv_len;
 			bvprv = bv;
-			prev = 1;
+			bvprvp = &bv;
 			continue;
 		}
 new_segment:
@@ -104,7 +103,7 @@ new_segment:
 
 		nsegs++;
 		bvprv = bv;
-		prev = 1;
+		bvprvp = &bv;
 		seg_size = bv.bv_len;
 	}
 
-- 
cgit 


From 5e7c4274a70aa2d6f485996d0ca1dad52d0039ca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 3 Sep 2015 19:28:20 +0300
Subject: block: Check for gaps on front and back merges

We are checking for gaps to previous bio_vec, which can
only detect back merges gaps. Moreover, at the point where
we check for a gap, we don't know if we will attempt a back
or a front merge. Thus, check for gap to prev in a back merge
attempt and check for a gap to next in a front merge attempt.

Signed-off-by: Jens Axboe <axboe@fb.com>
[sagig: Minor rename change]
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
---
 block/blk-merge.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index cce23ba1ae5f..d9eddbc189f5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -438,6 +438,8 @@ no_merge:
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		     struct bio *bio)
 {
+	if (req_gap_back_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -456,6 +458,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		      struct bio *bio)
 {
+
+	if (req_gap_front_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -482,14 +487,6 @@ static bool req_no_special_merge(struct request *req)
 	return !q->mq_ops && req->special;
 }
 
-static int req_gap_to_prev(struct request *req, struct bio *next)
-{
-	struct bio *prev = req->biotail;
-
-	return bvec_gap_to_prev(req->q, &prev->bi_io_vec[prev->bi_vcnt - 1],
-			next->bi_io_vec[0].bv_offset);
-}
-
 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 				struct request *next)
 {
@@ -504,7 +501,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (req_no_special_merge(req) || req_no_special_merge(next))
 		return 0;
 
-	if (req_gap_to_prev(req, next->bio))
+	if (req_gap_back_merge(req, next->bio))
 		return 0;
 
 	/*
@@ -712,10 +709,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
-	/* Only check gaps if the bio carries data */
-	if (bio_has_data(bio) && req_gap_to_prev(rq, bio))
-		return false;
-
 	return true;
 }
 
-- 
cgit 


From 7f39add3b08cbbdb99abe50e6d7c342e6800d684 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Fri, 11 Sep 2015 09:03:04 -0600
Subject: block: Refuse request/bio merges with gaps in the integrity payload

If a driver sets the block queue virtual boundary mask, it means that
it cannot handle gaps so we must not allow those in the integrity
payload as well.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>

Fixed up by me to have duplicate integrity merge functions, depending
on whether block integrity is enabled or not. Fixes a compilations
issue with CONFIG_BLK_DEV_INTEGRITY unset.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-integrity.c | 3 +++
 block/blk-merge.c     | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'block')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index f548b64be092..75f29cf70188 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 	    q->limits.max_integrity_segments)
 		return false;
 
+	if (integrity_req_gap_back_merge(req, next->bio))
+		return false;
+
 	return true;
 }
 EXPORT_SYMBOL(blk_integrity_merge_rq);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d9eddbc189f5..574ea7c0468f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -440,6 +440,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 {
 	if (req_gap_back_merge(req, bio))
 		return 0;
+	if (blk_integrity_rq(req) &&
+	    integrity_req_gap_back_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -461,6 +464,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 
 	if (req_gap_front_merge(req, bio))
 		return 0;
+	if (blk_integrity_rq(req) &&
+	    integrity_req_gap_front_merge(req, bio))
+		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req)) {
 		req->cmd_flags |= REQ_NOMERGE;
-- 
cgit 


From 87a816df537e096d404add543ef47b796906c130 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Tue, 8 Sep 2015 09:33:35 -0600
Subject: block: Refuse adding appending a gapped integrity page to a bio

This is only theoretical at the moment given that the only
subsystems that generate integrity payloads are the block layer
itself and the scsi target (which generate well aligned integrity
payloads). But when we will expose integrity meta-data to user-space,
we'll need to refuse appending a page with a gap (if the queue
virtual boundary is set).

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'block')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4aecca79374a..14b8faf8b09d 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 	iv = bip->bip_vec + bip->bip_vcnt;
 
+	if (bip->bip_vcnt &&
+	    bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
+		return 0;
+
 	iv->bv_page = page;
 	iv->bv_len = len;
 	iv->bv_offset = offset;
-- 
cgit 


From 46348456c1791053dcbe5a9e21825b10a3c8a8fb Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Thu, 3 Sep 2015 19:28:23 +0300
Subject: block: Copy a user iovec if it includes gaps

For drivers that don't support gaps in the SG lists handed to
them we must bounce (copy the user buffers) and pass a bio that
does not include gaps. This doesn't matter for any current user,
but will help to allow iser which can't handle gaps to use the
block virtual boundary instead of using driver-local bounce
buffering when handling SG_IO commands.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-map.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 233841644c9d..f565e11f465a 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -9,6 +9,24 @@
 
 #include "blk.h"
 
+static bool iovec_gap_to_prv(struct request_queue *q,
+			     struct iovec *prv, struct iovec *cur)
+{
+	unsigned long prev_end;
+
+	if (!queue_virt_boundary(q))
+		return false;
+
+	if (prv->iov_base == NULL && prv->iov_len == 0)
+		/* prv is not set - don't check */
+		return false;
+
+	prev_end = (unsigned long)(prv->iov_base + prv->iov_len);
+
+	return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) ||
+		prev_end & queue_virt_boundary(q));
+}
+
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio)
 {
@@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	struct bio *bio;
 	int unaligned = 0;
 	struct iov_iter i;
-	struct iovec iov;
+	struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
 
 	if (!iter || !iter->count)
 		return -EINVAL;
@@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		/*
 		 * Keep going so we check length of all segments
 		 */
-		if (uaddr & queue_dma_alignment(q))
+		if ((uaddr & queue_dma_alignment(q)) ||
+		    iovec_gap_to_prv(q, &prv, &iov))
 			unaligned = 1;
+
+		prv.iov_base = iov.iov_base;
+		prv.iov_len = iov.iov_len;
 	}
 
 	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
-- 
cgit 


From 6fe810bda0bd9a5d7674fc671fac27b8aa8ec243 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 5 Sep 2015 15:47:36 -0400
Subject: block: blkg_destroy_all() should clear q->root_blkg and
 ->root_rl.blkg

While making the root blkg unconditional, ec13b1d6f0a0 ("blkcg: always
create the blkcg_gq for the root blkcg") removed the part which clears
q->root_blkg and ->root_rl.blkg during q exit.  This leaves the two
pointers dangling after blkg_destroy_all().  blk-throttle exit path
performs blkg traversals and dereferences ->root_blkg and can lead to
the following oops.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000558
 IP: [<ffffffff81389746>] __blkg_lookup+0x26/0x70
 ...
 task: ffff88001b4e2580 ti: ffff88001ac0c000 task.ti: ffff88001ac0c000
 RIP: 0010:[<ffffffff81389746>]  [<ffffffff81389746>] __blkg_lookup+0x26/0x70
 ...
 Call Trace:
  [<ffffffff8138d14a>] blk_throtl_drain+0x5a/0x110
  [<ffffffff8138a108>] blkcg_drain_queue+0x18/0x20
  [<ffffffff81369a70>] __blk_drain_queue+0xc0/0x170
  [<ffffffff8136a101>] blk_queue_bypass_start+0x61/0x80
  [<ffffffff81388c59>] blkcg_deactivate_policy+0x39/0x100
  [<ffffffff8138d328>] blk_throtl_exit+0x38/0x50
  [<ffffffff8138a14e>] blkcg_exit_queue+0x3e/0x50
  [<ffffffff8137016e>] blk_release_queue+0x1e/0xc0
 ...

While the bug is a straigh-forward use-after-free bug, it is tricky to
reproduce because blkg release is RCU protected and the rest of exit
path usually finishes before RCU grace period.

This patch fixes the bug by updating blkg_destro_all() to clear
q->root_blkg and ->root_rl.blkg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Richard W.M. Jones" <rjones@redhat.com>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Link: http://lkml.kernel.org/g/CA+5PVA5rzQ0s4723n5rHBcxQa9t0cW8BPPBekr_9aMRoWt2aYg@mail.gmail.com
Fixes: ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg")
Cc: stable@vger.kernel.org # v4.2+
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6283b3f5db5..9cc48d1d7abb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -387,6 +387,9 @@ static void blkg_destroy_all(struct request_queue *q)
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
 	}
+
+	q->root_blkg = NULL;
+	q->root_rl.blkg = NULL;
 }
 
 /*
-- 
cgit 


From 52cc6eead9095e2faf2ec7afc013aa3af1f01ac5 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 17 Sep 2015 09:58:38 -0600
Subject: block: blk-merge: fast-clone bio when splitting rw bios

biovecs has become immutable since v3.13, so it isn't necessary
to allocate biovecs for the new cloned bios, then we can save
one extra biovecs allocation/copy, and the allocation is often
not fixed-length and a bit more expensive.

For example, if the 'max_sectors_kb' of null blk's queue is set
as 16(32 sectors) via sysfs just for making more splits, this patch
can increase throught about ~70% in the sequential read test over
null_blk(direct io, bs: 1M).

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Ming Lin <ming.l@ssi.samsung.com>
Cc: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lei <ming.lei@canonical.com>

This fixes a performance regression introduced by commit 54efd50bfd,
and allows us to take full advantage of the fact that we have immutable
bio_vecs. Hand applied, as it rejected violently with commit
5014c311baa2.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'block')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 574ea7c0468f..c4e9c37f3e38 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -66,15 +66,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
 					 struct bio_set *bs)
 {
-	struct bio *split;
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned seg_size = 0, nsegs = 0, sectors = 0;
 
 	bio_for_each_segment(bv, bio, iter) {
-		sectors += bv.bv_len >> 9;
-
-		if (sectors > queue_max_sectors(q))
+		if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
 			goto split;
 
 		/*
@@ -95,6 +92,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			seg_size += bv.bv_len;
 			bvprv = bv;
 			bvprvp = &bv;
+			sectors += bv.bv_len >> 9;
 			continue;
 		}
 new_segment:
@@ -105,21 +103,12 @@ new_segment:
 		bvprv = bv;
 		bvprvp = &bv;
 		seg_size = bv.bv_len;
+		sectors += bv.bv_len >> 9;
 	}
 
 	return NULL;
 split:
-	split = bio_clone_bioset(bio, GFP_NOIO, bs);
-
-	split->bi_iter.bi_size -= iter.bi_size;
-	bio->bi_iter = iter;
-
-	if (bio_integrity(bio)) {
-		bio_integrity_advance(bio, split->bi_iter.bi_size);
-		bio_integrity_trim(split, 0, bio_sectors(split));
-	}
-
-	return split;
+	return bio_split(bio, sectors, GFP_NOIO, bs);
 }
 
 void blk_queue_split(struct request_queue *q, struct bio **bio,
-- 
cgit 


From 994518799930fc363d47cb7cf0d1abed1790bf16 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Fri, 18 Sep 2015 00:06:28 +0800
Subject: block: fix bounce_end_io

When bio bounce is involved, one new bio and its biovecs are
cloned from the comming bio, which can be one fast-cloned bio
from upper layer(such as dm).

So it is obviously wrong to assume the start index of the coming(
original) bio's io vector is zero, which can be any value between
0 and (bi_max_vecs - 1), especially in case of bio split.

This patch fixes Fedora's booting oops on i386, often with the
following kernel log together:

> [    9.026738] systemd[1]: Switching root.
> [    9.036467] systemd-journald[149]: Received SIGTERM from PID 1
> (systemd).
> [    9.082262] BUG: Bad page state in process kworker/u5:1  pfn:372ac
> [    9.083989] page:f3d32ae0 count:0 mapcount:0 mapping:f2252178
> index:0x16a
> [    9.085755] flags: 0x40020021(locked|lru|mappedtodisk)
> [    9.087284] page dumped because: page still charged to cgroup
> [    9.088772] bad because of flags:
> [    9.089731] flags: 0x21(locked|lru)
> [    9.090818] page->mem_cgroup:f2c3e400

Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Tested-by: Adam Williamson <awilliam@redhat.com>
Cc: Ming Lin <mlin@kernel.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bounce.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bounce.c b/block/bounce.c
index 2c310ea007ee..457ebd57542f 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,12 +128,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 	struct bio *bio_orig = bio->bi_private;
 	struct bio_vec *bvec, *org_vec;
 	int i;
+	int start = bio_orig->bi_iter.bi_idx;
 
 	/*
 	 * free up bounce indirect pages used
 	 */
 	bio_for_each_segment_all(bvec, bio, i) {
-		org_vec = bio_orig->bi_io_vec + i;
+		org_vec = bio_orig->bi_io_vec + i + start;
+
 		if (bvec->bv_page == org_vec->bv_page)
 			continue;
 
-- 
cgit