From 5235553d821433e1f4fa720fd025d2c4b7ee9994 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 5 Feb 2018 13:16:56 -0700 Subject: blk-wbt: account flush requests correctly Mikulas reported a workload that saw bad performance, and figured out what it was due to various other types of requests being accounted as reads. Flush requests, for instance. Due to the high latency of those, we heavily throttle the writes to keep the latencies in balance. But they really should be accounted as writes. Fix this by checking the exact type of the request. If it's a read, account as a read, if it's a write or a flush, account as a write. Any other request we disregard. Previously everything would have been mistakenly accounted as reads. Reported-by: Mikulas Patocka Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Jens Axboe --- block/blk-wbt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index ae8de9780085..f92fc84b5e2c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q) static int wbt_data_dir(const struct request *rq) { - return rq_data_dir(rq); + const int op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH) + return WRITE; + + /* don't account */ + return -1; } int wbt_init(struct request_queue *q) -- cgit From 30abb3a67f4b2aa160feeb3c0b771f730cbcca67 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Tue, 6 Feb 2018 14:05:39 -0800 Subject: block: Add should_fail_bio() for bpf error injection The classic error injection mechanism, should_fail_request() does not support use cases where more information is required (from the entire struct bio, for example). To that end, this patch introduces should_fail_bio(), which calls should_fail_request() under the hood but provides a convenient place for kprobes to hook into if they require the entire struct bio. This patch also replaces some existing calls to should_fail_request() with should_fail_bio() with no degradation in performance. Signed-off-by: Howard McLauchlan Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index d0d104268f1a..2d1a7bbe0634 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) return false; } +static noinline int should_fail_bio(struct bio *bio) +{ + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + return -EIO; + return 0; +} +ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); + /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_bio(bio)) goto end_io; if (!bio->bi_partno) { -- cgit From a728eacbbdd229d1d903e46261c57d5206f87a4a Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:39 -0800 Subject: bcache: add journal statistic Sometimes, Journal takes up a lot of CPU, we need statistics to know what's the journal is doing. So this patch provide some journal statistics: 1) reclaim: how many times the journal try to reclaim resource, usually the journal bucket or/and the pin are exhausted. 2) flush_write: how many times the journal try to flush btree node to cache device, usually the journal bucket are exhausted. 3) retry_flush_write: how many times the journal retry to flush the next btree node, usually the previous tree node have been flushed by other thread. we show these statistic by sysfs interface. Through these statistics We can totally see the status of journal module when the CPU is too high. Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 4 ++++ drivers/md/bcache/journal.c | 5 +++++ drivers/md/bcache/sysfs.c | 15 +++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5e2d4e80198e..b98d7705acb6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -658,6 +658,10 @@ struct cache_set { atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + atomic_long_t reclaim; + atomic_long_t flush_write; + atomic_long_t retry_flush_write; + enum { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a87165c1d8e5..f5296007a9d5 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -377,6 +377,8 @@ static void btree_flush_write(struct cache_set *c) */ struct btree *b, *best; unsigned i; + + atomic_long_inc(&c->flush_write); retry: best = NULL; @@ -397,6 +399,7 @@ retry: if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); /* We raced */ + atomic_long_inc(&c->retry_flush_write); goto retry; } @@ -476,6 +479,8 @@ static void journal_reclaim(struct cache_set *c) unsigned iter, n = 0; atomic_t p; + atomic_long_inc(&c->reclaim); + while (!atomic_read(&fifo_front(&c->journal.pin))) fifo_pop(&c->journal.pin, p); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b4184092c727..46e7a6b3e7c7 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -65,6 +65,9 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); +read_attribute(reclaim); +read_attribute(flush_write); +read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -545,6 +548,15 @@ SHOW(__bch_cache_set) sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + + sysfs_print(retry_flush_write, + atomic_long_read(&c->retry_flush_write)); + sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -731,6 +743,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_flush_write, + &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, -- cgit From c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:40 -0800 Subject: bcache: fix high CPU occupancy during journal After long time small writing I/O running, we found the occupancy of CPU is very high and I/O performance has been reduced by about half: [root@ceph151 internal]# top top - 15:51:05 up 1 day,2:43, 4 users, load average: 16.89, 15.15, 16.53 Tasks: 2063 total, 4 running, 2059 sleeping, 0 stopped, 0 zombie %Cpu(s):4.3 us, 17.1 sy 0.0 ni, 66.1 id, 12.0 wa, 0.0 hi, 0.5 si, 0.0 st KiB Mem : 65450044 total, 24586420 free, 38909008 used, 1954616 buff/cache KiB Swap: 65667068 total, 65667068 free, 0 used. 25136812 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2023 root 20 0 0 0 0 S 55.1 0.0 0:04.42 kworker/11:191 14126 root 20 0 0 0 0 S 42.9 0.0 0:08.72 kworker/10:3 9292 root 20 0 0 0 0 S 30.4 0.0 1:10.99 kworker/6:1 8553 ceph 20 0 4242492 1.805g 18804 S 30.0 2.9 410:07.04 ceph-osd 12287 root 20 0 0 0 0 S 26.7 0.0 0:28.13 kworker/7:85 31019 root 20 0 0 0 0 S 26.1 0.0 1:30.79 kworker/22:1 1787 root 20 0 0 0 0 R 25.7 0.0 5:18.45 kworker/8:7 32169 root 20 0 0 0 0 S 14.5 0.0 1:01.92 kworker/23:1 21476 root 20 0 0 0 0 S 13.9 0.0 0:05.09 kworker/1:54 2204 root 20 0 0 0 0 S 12.5 0.0 1:25.17 kworker/9:10 16994 root 20 0 0 0 0 S 12.2 0.0 0:06.27 kworker/5:106 15714 root 20 0 0 0 0 R 10.9 0.0 0:01.85 kworker/19:2 9661 ceph 20 0 4246876 1.731g 18800 S 10.6 2.8 403:00.80 ceph-osd 11460 ceph 20 0 4164692 2.206g 18876 S 10.6 3.5 360:27.19 ceph-osd 9960 root 20 0 0 0 0 S 10.2 0.0 0:02.75 kworker/2:139 11699 ceph 20 0 4169244 1.920g 18920 S 10.2 3.1 355:23.67 ceph-osd 6843 ceph 20 0 4197632 1.810g 18900 S 9.6 2.9 380:08.30 ceph-osd The kernel work consumed a lot of CPU, and I found they are running journal work, The journal is reclaiming source and flush btree node with surprising frequency. Through further analysis, we found that in btree_flush_write(), we try to get a btree node with the smallest fifo idex to flush by traverse all the btree nodein c->bucket_hash, after we getting it, since no locker protects it, this btree node may have been written to cache device by other works, and if this occurred, we retry to traverse in c->bucket_hash and get another btree node. When the problem occurrd, the retry times is very high, and we consume a lot of CPU in looking for a appropriate btree node. In this patch, we try to record 128 btree nodes with the smallest fifo idex in heap, and pop one by one when we need to flush btree node. It greatly reduces the time for the loop to find the appropriate BTREE node, and also reduce the occupancy of CPU. [note by mpl: this triggers a checkpatch error because of adjacent, pre-existing style violations] Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/journal.c | 47 ++++++++++++++++++++++++++++++--------------- drivers/md/bcache/util.h | 2 ++ 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b98d7705acb6..a857eb3c10de 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -679,6 +679,8 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; + + DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index f5296007a9d5..1b736b860739 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -368,6 +368,12 @@ err: } /* Journalling */ +#define journal_max_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +#define journal_min_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { @@ -375,25 +381,35 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best; - unsigned i; + struct btree *b; + int i; atomic_long_inc(&c->flush_write); + retry: - best = NULL; - - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; + spin_lock(&c->journal.lock); + if (heap_empty(&c->flush_btree)) { + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!heap_full(&c->flush_btree)) + heap_add(&c->flush_btree, b, + journal_max_cmp); + else if (journal_max_cmp(b, + heap_peek(&c->flush_btree))) { + c->flush_btree.data[0] = b; + heap_sift(&c->flush_btree, 0, + journal_max_cmp); + } } - } - b = best; + for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) + heap_sift(&c->flush_btree, i, journal_min_cmp); + } + + b = NULL; + heap_pop(&c->flush_btree, b, journal_min_cmp); + spin_unlock(&c->journal.lock); + if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { @@ -824,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || + !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 4df4c5c1cab2..a6763db7f061 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -112,6 +112,8 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) +#define heap_empty(h) ((h)->used == 0) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ -- cgit From 99361bbf26337186f02561109c17a4c4b1a7536a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 7 Feb 2018 11:41:41 -0800 Subject: bcache: properly set task state in bch_writeback_thread() Kernel thread routine bch_writeback_thread() has the following code block, 447 down_write(&dc->writeback_lock); 448~450 if (check conditions) { 451 up_write(&dc->writeback_lock); 452 set_current_state(TASK_INTERRUPTIBLE); 453 454 if (kthread_should_stop()) 455 return 0; 456 457 schedule(); 458 continue; 459 } If condition check is true, its task state is set to TASK_INTERRUPTIBLE and call schedule() to wait for others to wake up it. There are 2 issues in current code, 1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if another process changes the condition and call wake_up_process(dc-> writeback_thread), then at line 452 task state is set back to TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be waken up. 2, At line 454 if kthread_should_stop() is true, writeback kernel thread will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and call do_exit(). It is not good to enter do_exit() with task state TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a warning message is reported by __might_sleep(): "WARNING: do not call blocking ops when !TASK_RUNNING; state=1 set at [xxxx]". For the first issue, task state should be set before condition checks. Ineed because dc->writeback_lock is required when modifying all the conditions, calling set_current_state() inside code block where dc-> writeback_lock is hold is safe. But this is quite implicit, so I still move set_current_state() before all the condition checks. For the second issue, frankley speaking it does not hurt when kernel thread exits with TASK_INTERRUPTIBLE state, but this warning message scares users, makes them feel there might be something risky with bcache and hurt their data. Setting task state to TASK_RUNNING before returning fixes this problem. In alloc.c:allocator_wait(), there is also a similar issue, and is also fixed in this patch. Changelog: v3: merge two similar fixes into one patch v2: fix the race issue in v1 patch. v1: initial buggy fix. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Cc: Michael Lyle Cc: Junhui Tang Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 4 +++- drivers/md/bcache/writeback.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6cc6c0f9c3a9..458e1d38577d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -287,8 +287,10 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (kthread_should_stop()) \ + if (kthread_should_stop()) { \ + set_current_state(TASK_RUNNING); \ return 0; \ + } \ \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 51306a19ab03..58218f7e77c3 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); if (!atomic_read(&dc->has_dirty) || (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); return 0; + } schedule(); continue; } + set_current_state(TASK_RUNNING); searched_full_index = refill_dirty(dc); -- cgit From 7ba0d830dc0e4c7e88602c91656029b6ae8a1766 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 7 Feb 2018 11:41:42 -0800 Subject: bcache: set error_limit correctly Struct cache uses io_errors for two purposes, - Error decay: when cache set error_decay is set, io_errors is used to generate a small piece of delay when I/O error happens. - I/O errors counter: in order to generate big enough value for error decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a IO_ERROR_SHIFT). In function bch_count_io_errors(), if I/O errors counter reaches cache set error limit, bch_cache_set_error() will be called to retire the whold cache set. But current code is problematic when checking the error limit, see the following code piece from bch_count_io_errors(), 90 if (error) { 91 char buf[BDEVNAME_SIZE]; 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, 93 &ca->io_errors); 94 errors >>= IO_ERROR_SHIFT; 95 96 if (errors < ca->set->error_limit) 97 pr_err("%s: IO error on %s, recovering", 98 bdevname(ca->bdev, buf), m); 99 else 100 bch_cache_set_error(ca->set, 101 "%s: too many IO errors %s", 102 bdevname(ca->bdev, buf), m); 103 } At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real errors counter to compare at line 96. But ca->set->error_limit is initia- lized with an amplified value in bch_cache_set_alloc(), 1545 c->error_limit = 8 << IO_ERROR_SHIFT; It means by default, in bch_count_io_errors(), before 8<<20 errors happened bch_cache_set_error() won't be called to retire the problematic cache device. If the average request size is 64KB, it means bcache won't handle failed device until 512GB data is requested. This is too large to be an I/O threashold. So I believe the correct error limit should be much less. This patch sets default cache set error limit to 8, then in bch_count_io_errors() when errors counter reaches 8 (if it is default value), function bch_cache_set_error() will be called to retire the whole cache set. This patch also removes bits shifting when store or show io_error_limit value via sysfs interface. Nowadays most of SSDs handle internal flash failure automatically by LBA address re-indirect mapping. If an I/O error can be observed by upper layer code, it will be a notable error because that SSD can not re-indirect map the problematic LBA address to an available flash block. This situation indicates the whole SSD will be failed very soon. Therefore setting 8 as the default io error limit value makes sense, it is enough for most of cache devices. Changelog: v2: add reviewed-by from Hannes. v1: initial version for review. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Tang Junhui Reviewed-by: Michael Lyle Cc: Junhui Tang Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 +- drivers/md/bcache/sysfs.c | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a857eb3c10de..b8c2e1bef1f1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -666,6 +666,7 @@ struct cache_set { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, } on_error; +#define DEFAULT_IO_ERROR_LIMIT 8 unsigned error_limit; unsigned error_decay; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 133b81225ea9..e29150ec17e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1553,7 +1553,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; - c->error_limit = 8 << IO_ERROR_SHIFT; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; return c; err: diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 46e7a6b3e7c7..c524305cc9a7 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -568,7 +568,7 @@ SHOW(__bch_cache_set) /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); - sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + sysfs_print(io_error_limit, c->error_limit); sysfs_hprint(congested, ((uint64_t) bch_get_congested(c)) << 9); @@ -668,7 +668,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + c->error_limit = strtoul_or_return(buf); /* See count_io_errors() for why 88 */ if (attr == &sysfs_io_error_halflife) -- cgit From 682811b3ce1a5a4e20d700939a9042f01dbc66c4 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:43 -0800 Subject: bcache: fix for allocator and register thread race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After long time running of random small IO writing, I reboot the machine, and after the machine power on, I found bcache got stuck, the stack is: [root@ceph153 ~]# cat /proc/2510/task/*/stack [] closure_sync+0x25/0x90 [bcache] [] bch_journal+0x118/0x2b0 [bcache] [] bch_journal_meta+0x47/0x70 [bcache] [] bch_prio_write+0x237/0x340 [bcache] [] bch_allocator_thread+0x3c8/0x3d0 [bcache] [] kthread+0xcf/0xe0 [] ret_from_fork+0x58/0x90 [] 0xffffffffffffffff [root@ceph153 ~]# cat /proc/2038/task/*/stack [] __bch_btree_map_nodes+0x12d/0x150 [bcache] [] bch_btree_insert+0xf1/0x170 [bcache] [] bch_journal_replay+0x13f/0x230 [bcache] [] run_cache_set+0x79a/0x7c2 [bcache] [] register_bcache+0xd48/0x1310 [bcache] [] kobj_attr_store+0xf/0x20 [] sysfs_write_file+0xc6/0x140 [] vfs_write+0xbd/0x1e0 [] SyS_write+0x7f/0xe0 [] system_call_fastpath+0x16/0x1 The stack shows the register thread and allocator thread were getting stuck when registering cache device. I reboot the machine several times, the issue always exsit in this machine. I debug the code, and found the call trace as bellow: register_bcache() ==>run_cache_set() ==>bch_journal_replay() ==>bch_btree_insert() ==>__bch_btree_map_nodes() ==>btree_insert_fn() ==>btree_split() //node need split ==>btree_check_reserve() In btree_check_reserve(), It will check if there is enough buckets of RESERVE_BTREE type, since allocator thread did not work yet, so no buckets of RESERVE_BTREE type allocated, so the register thread waits on c->btree_cache_wait, and goes to sleep. Then the allocator thread initialized, the call trace is bellow: bch_allocator_thread() ==>bch_prio_write() ==>bch_journal_meta() ==>bch_journal() ==>journal_wait_for_write() In journal_wait_for_write(), It will check if journal is full by journal_full(), but the long time random small IO writing causes the exhaustion of journal buckets(journal.blocks_free=0), In order to release the journal buckets, the allocator calls btree_flush_write() to flush keys to btree nodes, and waits on c->journal.wait until btree nodes writing over or there has already some journal buckets space, then the allocator thread goes to sleep. but in btree_flush_write(), since bch_journal_replay() is not finished, so no btree nodes have journal (condition "if (btree_current_write(b)->journal)" never satisfied), so we got no btree node to flush, no journal bucket released, and allocator sleep all the times. Through the above analysis, we can see that: 1) Register thread wait for allocator thread to allocate buckets of RESERVE_BTREE type; 2) Alloctor thread wait for register thread to replay journal, so it can flush btree nodes and get journal bucket. then they are all got stuck by waiting for each other. Hua Rui provided a patch for me, by allocating some buckets of RESERVE_BTREE type in advance, so the register thread can get bucket when btree node splitting and no need to waiting for the allocator thread. I tested it, it has effect, and register thread run a step forward, but finally are still got stuck, the reason is only 8 bucket of RESERVE_BTREE type were allocated, and in bch_journal_replay(), after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left, then btree_check_reserve() is not satisfied anymore, so it goes to sleep again, and in the same time, alloctor thread did not flush enough btree nodes to release a journal bucket, so they all got stuck again. So we need to allocate more buckets of RESERVE_BTREE type in advance, but how much is enough? By experience and test, I think it should be as much as journal buckets. Then I modify the code as this patch, and test in the machine, and it works. This patch modified base on Hua Rui’s patch, and allocate more buckets of RESERVE_BTREE type in advance to avoid register thread and allocate thread going to wait for each other. [patch v2] ca->sb.njournal_buckets would be 0 in the first time after cache creation, and no journal exists, so just 8 btree buckets is OK. Signed-off-by: Hua Rui Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 9 ++++++--- drivers/md/bcache/super.c | 13 ++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bf3a48aa9a9a..fad9fe8817eb 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c) */ for_each_cache(ca, c, i) { for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO])) + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) break; if (bch_can_invalidate_bucket(ca, b) && !GC_MARK(b)) { __bch_invalidate_one_bucket(ca, b); - fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e29150ec17e7..a2ad37a8afc0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1833,6 +1833,7 @@ void bch_cache_release(struct kobject *kobj) static int cache_alloc(struct cache *ca) { size_t free; + size_t btree_buckets; struct bucket *b; __module_get(THIS_MODULE); @@ -1840,9 +1841,19 @@ static int cache_alloc(struct cache *ca) bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + /* + * when ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split, + * so bucket of RESERVE_BTREE type is needed, + * the worst situation is all journal buckets are valid journal, + * and all the keys need to replay, + * so the number of RESERVE_BTREE type buckets should be as much + * as journal buckets + */ + btree_buckets = ca->sb.njournal_buckets ?: 8; free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) || !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || -- cgit From 7a5e3ecbe5b7b58e9a78a3738b28244982822e1c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 7 Feb 2018 11:41:44 -0800 Subject: bcache: set writeback_rate_update_seconds in range [1, 60] seconds dc->writeback_rate_update_seconds can be set via sysfs and its value can be set to [1, ULONG_MAX]. It does not make sense to set such a large value, 60 seconds is long enough value considering the default 5 seconds works well for long time. Because dc->writeback_rate_update is a special delayed work, it re-arms itself inside the delayed work routine update_writeback_rate(). When stopping it by cancel_delayed_work_sync(), there should be a timeout to wait and make sure the re-armed delayed work is stopped too. A small max value of dc->writeback_rate_update_seconds is also helpful to decide a reasonable small timeout. This patch limits sysfs interface to set dc->writeback_rate_update_seconds in range of [1, 60] seconds, and replaces the hand-coded number by macros. Changelog: v2: fix a rebase typo in v4, which is pointed out by Michael Lyle. v1: initial version. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 +++- drivers/md/bcache/writeback.c | 2 +- drivers/md/bcache/writeback.h | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c524305cc9a7..4a6a697e1680 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -218,7 +218,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate, dc->writeback_rate.rate, 1, INT_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate_update_seconds, + dc->writeback_rate_update_seconds, + 1, WRITEBACK_RATE_UPDATE_SECS_MAX); d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 58218f7e77c3..f1d2fc15abcc 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate.rate = 1024; dc->writeback_rate_minimum = 8; - dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; dc->writeback_rate_i_term_inverse = 10000; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 66f1c527fa24..587b25599856 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -8,6 +8,9 @@ #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ +#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 +#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up -- cgit From 7f4fc93d4713394ee8f1cd44c238e046e11b4f15 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:45 -0800 Subject: bcache: return attach error when no cache set exist I attach a back-end device to a cache set, and the cache set is not registered yet, this back-end device did not attach successfully, and no error returned: [root]# echo 87859280-fec6-4bcc-20df7ca8f86b > /sys/block/sde/bcache/attach [root]# In sysfs_attach(), the return value "v" is initialized to "size" in the beginning, and if no cache set exist in bch_cache_sets, the "v" value would not change any more, and return to sysfs, sysfs regard it as success since the "size" is a positive number. This patch fixes this issue by assigning "v" with "-ENOENT" in the initialization. Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4a6a697e1680..a7552f509f50 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -198,7 +198,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - ssize_t v = size; + ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; @@ -275,6 +275,7 @@ STORE(__cached_dev) if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) return -EINVAL; + v = -ENOENT; list_for_each_entry(c, &bch_cache_sets, list) { v = bch_cached_dev_attach(dc, c); if (!v) @@ -282,7 +283,7 @@ STORE(__cached_dev) } pr_err("Can't attach %s: cache set not found", buf); - size = v; + return v; } if (attr == &sysfs_detach && dc->disk.c) -- cgit From 73ac105be390c1de42a2f21643c9778a5e002930 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:46 -0800 Subject: bcache: fix for data collapse after re-attaching an attached device back-end device sdm has already attached a cache_set with ID f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with another cache set, and it returns with an error: [root]# cd /sys/block/sdm/bcache [root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach -bash: echo: write error: Invalid argument After that, execute a command to modify the label of bcache device: [root]# echo data_disk1 > label Then we reboot the system, when the system power on, the back-end device can not attach to cache_set, a messages show in the log: Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache: bch_cached_dev_attach() couldn't find uuid for sdm in set In sysfs_attach(), dc->sb.set_uuid was assigned to the value which input through sysfs, no matter whether it is success or not in bch_cached_dev_attach(). For example, If the back-end device has already attached to an cache set, bch_cached_dev_attach() would fail, but dc->sb.set_uuid was changed. Then modify the label of bcache device, it will call bch_write_bdev_super(), which would write the dc->sb.set_uuid to the super block, so we record a wrong cache set ID in the super block, after the system reboot, the cache set couldn't find the uuid of the back-end device, so the bcache device couldn't exist and use any more. In this patch, we don't assigned cache set ID to dc->sb.set_uuid in sysfs_attach() directly, but input it into bch_cached_dev_attach(), and assigned dc->sb.set_uuid to the cache set ID after the back-end device attached to the cache set successful. Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 10 ++++++---- drivers/md/bcache/sysfs.c | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b8c2e1bef1f1..12e5197f186c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -924,7 +924,7 @@ void bcache_write_super(struct cache_set *); int bch_flash_dev_create(struct cache_set *c, uint64_t size); -int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *); void bch_cached_dev_detach(struct cached_dev *); void bch_cached_dev_run(struct cached_dev *); void bcache_device_stop(struct bcache_device *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a2ad37a8afc0..312895788036 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc) cached_dev_put(dc); } -int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid) { uint32_t rtime = cpu_to_le32(get_seconds()); struct uuid_entry *u; @@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bdevname(dc->bdev, buf); - if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) return -ENOENT; if (dc->disk.c) { @@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) @@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c) bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); flash_devs_run(c); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a7552f509f50..78cd7bd50fdd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -272,12 +272,14 @@ STORE(__cached_dev) } if (attr == &sysfs_attach) { - if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) + uint8_t set_uuid[16]; + + if (bch_parse_uuid(buf, set_uuid) < 16) return -EINVAL; v = -ENOENT; list_for_each_entry(c, &bch_cache_sets, list) { - v = bch_cached_dev_attach(dc, c); + v = bch_cached_dev_attach(dc, c, set_uuid); if (!v) return size; } -- cgit From a7877390614770965a6925dfed79cbd3eeeb61e0 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 7 Feb 2018 22:19:20 +0100 Subject: block, bfq: add requeue-request hook Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device be re-inserted into the active I/O scheduler for that device. As a consequence, I/O schedulers may get the same request inserted again, even several times, without a finish_request invoked on that request before each re-insertion. This fact is the cause of the failure reported in [1]. For an I/O scheduler, every re-insertion of the same re-prepared request is equivalent to the insertion of a new request. For schedulers like mq-deadline or kyber, this fact causes no harm. In contrast, it confuses a stateful scheduler like BFQ, which keeps state for an I/O request, until the finish_request hook is invoked on the request. In particular, BFQ may get stuck, waiting forever for the number of request dispatches, of the same request, to be balanced by an equal number of request completions (while there will be one completion for that request). In this state, BFQ may refuse to serve I/O requests from other bfq_queues. The hang reported in [1] then follows. However, the above re-prepared requests undergo a requeue, thus the requeue_request hook of the active elevator is invoked for these requests, if set. This commit then addresses the above issue by properly implementing the hook requeue_request in BFQ. [1] https://marc.info/?l=linux-block&m=151211117608676 Reported-by: Ivan Kozik Reported-by: Alban Browaeys Tested-by: Mike Galbraith Signed-off-by: Paolo Valente Signed-off-by: Serena Ziviani Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 107 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 47e6ec7427c4..aeca22d91101 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) } /* - * We exploit the bfq_finish_request hook to decrement - * rq_in_driver, but bfq_finish_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, - * rq_in_driver is deceptively lower than it should be - * while this request is in service. This may cause - * bfq_schedule_dispatch to be invoked uselessly. + * We exploit the bfq_finish_requeue_request hook to + * decrement rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start + * this request, without incrementing rq_in_driver. As + * a negative consequence, rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. * * As for implementing an exact solution, the - * bfq_finish_request hook, if defined, is probably - * invoked also on this request. So, by exploiting - * this hook, we could 1) increment rq_in_driver here, - * and 2) decrement it in bfq_finish_request. Such a - * solution would let the value of the counter be - * always accurate, but it would entail using an extra - * interface function. This cost seems higher than the - * benefit, being the frequency of non-elevator-private + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment + * rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface + * function. This cost seems higher than the benefit, + * being the frequency of non-elevator-private * requests very low. */ goto start_rq; @@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif +static void bfq_prepare_request(struct request *rq, struct bio *bio); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { + if (WARN_ON_ONCE(!bfqq)) { + /* + * This should never happen. Most likely rq is + * a requeued regular request, being + * re-inserted without being first + * re-prepared. Do a prepare, to avoid + * failure. + */ + bfq_prepare_request(rq, rq->bio); + bfqq = RQ_BFQQ(rq); + } + idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_request_body(struct bfq_queue *bfqq) +static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { bfqq->allocated--; bfq_put_queue(bfqq); } -static void bfq_finish_request(struct request *rq) +/* + * Handle either a requeue or a finish for rq. The things to do are + * the same in both cases: all references to rq are to be dropped. In + * particular, rq is considered completed from the point of view of + * the scheduler. + */ +static void bfq_finish_requeue_request(struct request *rq) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; - if (!rq->elv.icq) + /* + * Requeue and finish hooks are invoked in blk-mq without + * checking whether the involved request is actually still + * referenced in the scheduler. To handle this fact, the + * following two checks make this function exit in case of + * spurious invocations, for which there is nothing to do. + * + * First, check whether rq has nothing to do with an elevator. + */ + if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) + return; + + /* + * rq either is not associated with any icq, or is an already + * requeued request that has not (yet) been re-inserted into + * a bfq_queue. + */ + if (!rq->elv.icq || !bfqq) return; - bfqq = RQ_BFQQ(rq); bfqd = bfqq->bfqd; if (rq->rq_flags & RQF_STARTED) @@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq) spin_lock_irqsave(&bfqd->lock, flags); bfq_completed_request(bfqq, bfqd); - bfq_finish_request_body(bfqq); + bfq_finish_requeue_request_body(bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } else { /* * Request rq may be still/already in the scheduler, - * in which case we need to remove it. And we cannot + * in which case we need to remove it (this should + * never happen in case of requeue). And we cannot * defer such a check and removal, to avoid * inconsistencies in the time interval from the end * of this function to the start of the deferred work. @@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } - bfq_finish_request_body(bfqq); + bfq_finish_requeue_request_body(bfqq); } + /* + * Reset private fields. In case of a requeue, this allows + * this function to correctly do nothing if it is spuriously + * invoked again on this same request (see the check at the + * beginning of the function). Probably, a better general + * design would be to prevent blk-mq from invoking the requeue + * or finish hooks of an elevator, for a request that is not + * referred by that elevator. + * + * Resetting the following fields would break the + * request-insertion logic if rq is re-inserted into a bfq + * internal queue, without a re-preparation. Here we assume + * that re-insertions of requeued requests, without + * re-preparation, can happen only for pass_through or at_head + * requests (which are not re-inserted into bfq internal + * queues). + */ rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; } @@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = { .ops.mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, + .requeue_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_requeue_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, -- cgit