From 4f8126bb2308066b877859e4b5923ffb54143630 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 5 Nov 2022 19:10:55 -0400 Subject: sbitmap: Use single per-bitmap counting to wake up queued tags sbitmap suffers from code complexity, as demonstrated by recent fixes, and eventual lost wake ups on nested I/O completion. The later happens, from what I understand, due to the non-atomic nature of the updates to wait_cnt, which needs to be subtracted and eventually reset when equal to zero. This two step process can eventually miss an update when a nested completion happens to interrupt the CPU in between the wait_cnt updates. This is very hard to fix, as shown by the recent changes to this code. The code complexity arises mostly from the corner cases to avoid missed wakes in this scenario. In addition, the handling of wake_batch recalculation plus the synchronization with sbq_queue_wake_up is non-trivial. This patchset implements the idea originally proposed by Jan [1], which removes the need for the two-step updates of wait_cnt. This is done by tracking the number of completions and wakeups in always increasing, per-bitmap counters. Instead of having to reset the wait_cnt when it reaches zero, we simply keep counting, and attempt to wake up N threads in a single wait queue whenever there is enough space for a batch. Waking up less than batch_wake shouldn't be a problem, because we haven't changed the conditions for wake up, and the existing batch calculation guarantees at least enough remaining completions to wake up a batch for each queue at any time. Performance-wise, one should expect very similar performance to the original algorithm for the case where there is no queueing. In both the old algorithm and this implementation, the first thing is to check ws_active, which bails out if there is no queueing to be managed. In the new code, we took care to avoid accounting completions and wakeups when there is no queueing, to not pay the cost of atomic operations unnecessarily, since it doesn't skew the numbers. For more interesting cases, where there is queueing, we need to take into account the cross-communication of the atomic operations. I've been benchmarking by running parallel fio jobs against a single hctx nullb in different hardware queue depth scenarios, and verifying both IOPS and queueing. Each experiment was repeated 5 times on a 20-CPU box, with 20 parallel jobs. fio was issuing fixed-size randwrites with qd=64 against nullb, varying only the hardware queue length per test. queue size 2 4 8 16 32 64 6.1-rc2 1681.1K (1.6K) 2633.0K (12.7K) 6940.8K (16.3K) 8172.3K (617.5K) 8391.7K (367.1K) 8606.1K (351.2K) patched 1721.8K (15.1K) 3016.7K (3.8K) 7543.0K (89.4K) 8132.5K (303.4K) 8324.2K (230.6K) 8401.8K (284.7K) The following is a similar experiment, ran against a nullb with a single bitmap shared by 20 hctx spread across 2 NUMA nodes. This has 40 parallel fio jobs operating on the same device queue size 2 4 8 16 32 64 6.1-rc2 1081.0K (2.3K) 957.2K (1.5K) 1699.1K (5.7K) 6178.2K (124.6K) 12227.9K (37.7K) 13286.6K (92.9K) patched 1081.8K (2.8K) 1316.5K (5.4K) 2364.4K (1.8K) 6151.4K (20.0K) 11893.6K (17.5K) 12385.6K (18.4K) It has also survived blktests and a 12h-stress run against nullb. I also ran the code against nvme and a scsi SSD, and I didn't observe performance regression in those. If there are other tests you think I should run, please let me know and I will follow up with results. [1] https://lore.kernel.org/all/aef9de29-e9f5-259a-f8be-12d1b734e72@google.com/ Cc: Hugh Dickins Cc: Keith Busch Cc: Liu Song Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221105231055.25953-1-krisman@suse.de Signed-off-by: Jens Axboe --- lib/sbitmap.c | 122 +++++++++++++--------------------------------------------- 1 file changed, 26 insertions(+), 96 deletions(-) (limited to 'lib/sbitmap.c') diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 7280ae8ca88c..eca462cba398 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -434,6 +434,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); + atomic_set(&sbq->completion_cnt, 0); + atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { @@ -441,40 +443,21 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, return -ENOMEM; } - for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); - atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); - } return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int wake_batch) -{ - int i; - - if (sbq->wake_batch != wake_batch) { - WRITE_ONCE(sbq->wake_batch, wake_batch); - /* - * Pairs with the memory barrier in sbitmap_queue_wake_up() - * to ensure that the batch size is updated before the wait - * counts. - */ - smp_mb(); - for (i = 0; i < SBQ_WAIT_QUEUES; i++) - atomic_set(&sbq->ws[i].wait_cnt, 1); - } -} - static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + if (sbq->wake_batch != wake_batch) + WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, @@ -488,7 +471,8 @@ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, min_batch, SBQ_WAKE_BATCH); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + + WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); @@ -587,7 +571,7 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; - if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) { + if (waitqueue_active(&ws->wait)) { if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); return ws; @@ -599,83 +583,31 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) return NULL; } -static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr) +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { - struct sbq_wait_state *ws; - unsigned int wake_batch; - int wait_cnt, cur, sub; - bool ret; + unsigned int wake_batch = READ_ONCE(sbq->wake_batch); + struct sbq_wait_state *ws = NULL; + unsigned int wakeups; - if (*nr <= 0) - return false; + if (!atomic_read(&sbq->ws_active)) + return; - ws = sbq_wake_ptr(sbq); - if (!ws) - return false; + atomic_add(nr, &sbq->completion_cnt); + wakeups = atomic_read(&sbq->wakeup_cnt); - cur = atomic_read(&ws->wait_cnt); do { - /* - * For concurrent callers of this, callers should call this - * function again to wakeup a new batch on a different 'ws'. - */ - if (cur == 0) - return true; - sub = min(*nr, cur); - wait_cnt = cur - sub; - } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt)); - - /* - * If we decremented queue without waiters, retry to avoid lost - * wakeups. - */ - if (wait_cnt > 0) - return !waitqueue_active(&ws->wait); + if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) + return; - *nr -= sub; - - /* - * When wait_cnt == 0, we have to be particularly careful as we are - * responsible to reset wait_cnt regardless whether we've actually - * woken up anybody. But in case we didn't wakeup anybody, we still - * need to retry. - */ - ret = !waitqueue_active(&ws->wait); - wake_batch = READ_ONCE(sbq->wake_batch); + if (!ws) { + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + } + } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, + &wakeups, wakeups + wake_batch)); - /* - * Wake up first in case that concurrent callers decrease wait_cnt - * while waitqueue is empty. - */ wake_up_nr(&ws->wait, wake_batch); - - /* - * Pairs with the memory barrier in sbitmap_queue_resize() to - * ensure that we see the batch size update before the wait - * count is reset. - * - * Also pairs with the implicit barrier between decrementing wait_cnt - * and checking for waitqueue_active() to make sure waitqueue_active() - * sees result of the wakeup if atomic_dec_return() has seen the result - * of atomic_set(). - */ - smp_mb__before_atomic(); - - /* - * Increase wake_index before updating wait_cnt, otherwise concurrent - * callers can see valid wait_cnt in old waitqueue, which can cause - * invalid wakeup on the old waitqueue. - */ - sbq_index_atomic_inc(&sbq->wake_index); - atomic_set(&ws->wait_cnt, wake_batch); - - return ret || *nr; -} - -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) -{ - while (__sbq_wake_up(sbq, &nr)) - ; } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); @@ -792,9 +724,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; - - seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", - atomic_read(&ws->wait_cnt), + seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); -- cgit From 976570b4ecd30d3ec6e1b0910da8e5edc591f2b6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:51 -0500 Subject: sbitmap: Advance the queue index before waking up a queue When a queue is awaken, the wake_index written by sbq_wake_ptr currently keeps pointing to the same queue. On the next wake up, it will thus retry the same queue, which is unfair to other queues, and can lead to starvation. This patch, moves the index update to happen before the queue is returned, such that it will now try a different queue first on the next wake up, improving fairness. Fixes: 4f8126bb2308 ("sbitmap: Use single per-bitmap counting to wake up queued tags") Reported-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221115224553.23594-2-krisman@suse.de Signed-off-by: Jens Axboe --- lib/sbitmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib/sbitmap.c') diff --git a/lib/sbitmap.c b/lib/sbitmap.c index eca462cba398..bea7984f7987 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -571,13 +571,19 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; + /* + * Advance the index before checking the current queue. + * It improves fairness, by ensuring the queue doesn't + * need to be fully emptied before trying to wake up + * from the next one. + */ + wake_index = sbq_index_inc(wake_index); + if (waitqueue_active(&ws->wait)) { if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); return ws; } - - wake_index = sbq_index_inc(wake_index); } return NULL; -- cgit From 26edb30dd1c0c9be11fa676b4f330ada7b794ba6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:53 -0500 Subject: sbitmap: Try each queue to wake up at least one waiter Jan reported the new algorithm as merged might be problematic if the queue being awaken becomes empty between the waitqueue_active inside sbq_wake_ptr check and the wake up. If that happens, wake_up_nr will not wake up any waiter and we loose too many wake ups. In order to guarantee progress, we need to wake up at least one waiter here, if there are any. This now requires trying to wake up from every queue. Instead of walking through all the queues with sbq_wake_ptr, this call moves the wake up inside that function. In a previous version of the patch, I found that updating wake_index several times when walking through queues had a measurable overhead. This ensures we only update it once, at the end. Fixes: 4f8126bb2308 ("sbitmap: Use single per-bitmap counting to wake up queued tags") Reported-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221115224553.23594-4-krisman@suse.de Signed-off-by: Jens Axboe --- lib/sbitmap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'lib/sbitmap.c') diff --git a/lib/sbitmap.c b/lib/sbitmap.c index bea7984f7987..586deb333237 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -560,12 +560,12 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); -static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index; if (!atomic_read(&sbq->ws_active)) - return NULL; + return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { @@ -579,20 +579,22 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) */ wake_index = sbq_index_inc(wake_index); - if (waitqueue_active(&ws->wait)) { - if (wake_index != atomic_read(&sbq->wake_index)) - atomic_set(&sbq->wake_index, wake_index); - return ws; - } + /* + * It is sufficient to wake up at least one waiter to + * guarantee forward progress. + */ + if (waitqueue_active(&ws->wait) && + wake_up_nr(&ws->wait, nr)) + break; } - return NULL; + if (wake_index != atomic_read(&sbq->wake_index)) + atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); - struct sbq_wait_state *ws = NULL; unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) @@ -604,16 +606,10 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; - - if (!ws) { - ws = sbq_wake_ptr(sbq); - if (!ws) - return; - } } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); - wake_up_nr(&ws->wait, wake_batch); + __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); -- cgit