From cdfa0f6fa6b7183c062046043b649b9a91e3ac52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Apr 2023 16:49:14 -0700 Subject: rcu/kvfree: Add debug to check grace periods This commit adds debugging checks to verify that the required RCU grace period has elapsed for each kvfree_rcu_bulk_data structure that arrives at the kvfree_rcu_bulk() function. These checks make use of that structure's ->gp_snap field, which has been upgraded from an unsigned long to an rcu_gp_oldstate structure. This upgrade reduces the chances of false positives to nearly zero, even on 32-bit systems, for which this structure carries 64 bits of state. Cc: Ziwei Dai Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..91d75fd6c579 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2756,7 +2756,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2921,23 +2921,24 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, int i; debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } } + rcu_lock_release(&rcu_callback_map); } - rcu_lock_release(&rcu_callback_map); raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bnode)) @@ -3081,7 +3082,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) INIT_LIST_HEAD(&bulk_ready[i]); list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu(bnode->gp_snap)) + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) break; atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); @@ -3285,7 +3286,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // Finally insert and update the GP for this page. bnode->records[bnode->nr_records++] = ptr; - bnode->gp_snap = get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); return true; -- cgit From f32276a37652a9ce05db27cdfb40ac3e3fc98f9f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 4 Apr 2023 16:13:00 +0200 Subject: rcu/kvfree: Add debug check for GP complete for kfree_rcu_cpu list Under low-memory conditions, kvfree_rcu() will use each object's rcu_head structure to queue objects in a singly linked list headed by the kfree_rcu_cpu structure's ->head field. This list is passed to call_rcu() as a unit, but there is no indication of which grace period this list needs to wait for. This in turn prevents adding debug checks in the kfree_rcu_work() as was done for the two page-of-pointers channels in the kfree_rcu_cpu structure. This commit therefore adds a ->head_free_gp_snap field to the kfree_rcu_cpu_work structure to record this grace-period number. It also adds a WARN_ON_ONCE() to kfree_rcu_monitor() that checks to make sure that the required grace period has in fact elapsed. [ paulmck: Fix kerneldoc issue raised by Stephen Rothwell. ] Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 91d75fd6c579..7452ba97ba34 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2773,6 +2773,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2780,6 +2781,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2985,6 +2987,7 @@ static void kfree_rcu_work(struct work_struct *work) struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; int i; krwp = container_of(to_rcu_work(work), @@ -2999,6 +3002,7 @@ static void kfree_rcu_work(struct work_struct *work) // Channel 3. head = krwp->head_free; krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; raw_spin_unlock_irqrestore(&krcp->lock, flags); // Handle the first two channels. @@ -3015,7 +3019,8 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - kvfree_rcu_list(head); + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); } static bool @@ -3147,6 +3152,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); atomic_set(&krcp->head_count, 0); WRITE_ONCE(krcp->head, NULL); } -- cgit From 1e237994d9c9a5ae47ae13030585a413a29469e6 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 5 Apr 2023 10:13:59 +0800 Subject: rcu/kvfree: Invoke debug_rcu_bhead_unqueue() after checking bnode->gp_snap If kvfree_rcu_bulk() sees that the required grace period has failed to elapse, it leaks the memory because readers might still be using it. But in that case, the debug-objects subsystem still marks the relevant structures as having been freed, even though they are instead being leaked. This commit fixes this mismatch by invoking debug_rcu_bhead_unqueue() only when we are actually going to free the objects. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7452ba97ba34..426f1f3bb5f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2922,8 +2922,8 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); rcu_lock_acquire(&rcu_callback_map); if (idx == 0) { // kmalloc() / kfree(). trace_rcu_invoke_kfree_bulk_callback( -- cgit From 309a4316507767f8078d30c9681dc76f4299b0f1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 8 Apr 2023 22:25:30 +0800 Subject: rcu/kvfree: Use consistent krcp when growing kfree_rcu() page cache The add_ptr_to_bulk_krc_lock() function is invoked to allocate a new kfree_rcu() page, also known as a kvfree_rcu_bulk_data structure. The kfree_rcu_cpu structure's lock is used to protect this operation, except that this lock must be momentarily dropped when allocating memory. It is clearly important that the lock that is reacquired be the same lock that was acquired initially via krc_this_cpu_lock(). Unfortunately, this same krc_this_cpu_lock() function is used to re-acquire this lock, and if the task migrated to some other CPU during the memory allocation, this will result in the kvfree_rcu_bulk_data structure being added to the wrong CPU's kfree_rcu_cpu structure. This commit therefore replaces that second call to krc_this_cpu_lock() with raw_spin_lock_irqsave() in order to explicitly acquire the lock on the correct kfree_rcu_cpu structure, thus keeping things straight even when the task migrates. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 426f1f3bb5f2..51d84eabf645 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3279,7 +3279,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // scenarios. bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); } if (!bnode) -- cgit From 021a5ff8474379cd6c23e9b0e97aa27e5ff66a8b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 11 Apr 2023 15:13:41 +0200 Subject: rcu/kvfree: Do not run a page work if a cache is disabled By default the cache size is 5 pages per CPU, but it can be disabled at boot time by setting the rcu_min_cached_objs to zero. When that happens, the current code will uselessly set an hrtimer to schedule refilling this cache with zero pages. This commit therefore streamlines this process by simply refusing the set the hrtimer when rcu_min_cached_objs is zero. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d84eabf645..18f592bf6dc6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3225,6 +3225,10 @@ static void fill_page_cache_func(struct work_struct *work) static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { -- cgit From 60888b77a06ea16665e4df980bb86b418253e268 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 12 Apr 2023 22:31:27 +0800 Subject: rcu/kvfree: Make fill page cache start from krcp->nr_bkv_objs When the fill_page_cache_func() function is invoked, it assumes that the cache of pages is completely empty. However, there can be some time between triggering execution of this function and its actual invocation. During this time, kfree_rcu_work() might run, and might fill in part or all of this cache of pages, thus invalidating the fill_page_cache_func() function's assumption. This will not overfill the cache because put_cached_bnode() will reject the extra page. However, it will result in a needless allocation and freeing of one extra page, which might not be helpful under lowish-memory conditions. This commit therefore causes the fill_page_cache_func() to explicitly account for pages that have been placed into the cache shortly before it starts running. Signed-off-by: Zqiang Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 18f592bf6dc6..98f2e833e217 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3201,7 +3201,7 @@ static void fill_page_cache_func(struct work_struct *work) nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? 1 : rcu_min_cached_objs; - for (i = 0; i < nr_pages; i++) { + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); -- cgit From 6b706e5603c44ff0b6f43c2e26e0d590e1d265f8 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 18 Apr 2023 20:27:02 +0800 Subject: rcu/kvfree: Make drain_page_cache() take early return if cache is disabled If the rcutree.rcu_min_cached_objs kernel boot parameter is set to zero, then krcp->page_cache_work will never be triggered to fill page cache. In addition, the put_cached_bnode() will not fill page cache. As a result krcp->bkvcache will always be empty, so there is no need to acquire krcp->lock to get page from krcp->bkvcache. This commit therefore makes drain_page_cache() return immediately if the rcu_min_cached_objs is zero. Signed-off-by: Zqiang Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 98f2e833e217..00ed45ddc6ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2902,6 +2902,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); -- cgit From fea1c1f0101783f24d00e065ecd3d6e90292f887 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Mar 2023 16:43:54 -0700 Subject: rcu: Check callback-invocation time limit for rcuc kthreads Currently, a callback-invocation time limit is enforced only for callbacks invoked from the softirq environment, the rationale being that when callbacks are instead invoked from rcuc and rcuoc kthreads, these callbacks cannot be holding up other softirq vectors. Which is in fact true. However, if an rcuc kthread spends too much time invoking callbacks, it can delay quiescent-state reports from its CPU, which can also be a problem. This commit therefore applies the callback-invocation time limit to callback invocation from the rcuc kthreads as well as from softirq. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..9a5c160186d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2046,6 +2046,13 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit) +{ + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && !likely(count & 31) && local_clock() >= tlimit; +} + /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. @@ -2082,7 +2089,8 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + unlikely(bl > 100)) { long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; @@ -2126,21 +2134,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit)) break; - } } else { - // In rcuoc context, so no worries about depriving - // other softirq vectors of CPU cycles. + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } -- cgit From f51164a808b5bf1d81fc37eb53ab1eae59c79f2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Mar 2023 09:05:56 -0700 Subject: rcu: Employ jiffies-based backstop to callback time limit Currently, if there are more than 100 ready-to-invoke RCU callbacks queued on a given CPU, the rcu_do_batch() function sets a timeout for invocation of the series. This timeout defaulting to three milliseconds, and may be adjusted using the rcutree.rcu_resched_ns kernel boot parameter. This timeout is checked using local_clock(), but the overhead of this function combined with the common-case very small callback-invocation overhead means that local_clock() is checked every 32nd invocation. This works well except for longer-than average callbacks. For example, a series of 500-microsecond-duration callbacks means that local_clock() is checked only once every 16 milliseconds, which makes it difficult to enforce a three-millisecond timeout. This commit therefore adds a Kconfig option RCU_DOUBLE_CHECK_CB_TIME that enables backup timeout checking using the coarser grained but lighter weight jiffies. If the jiffies counter detects a timeout, then local_clock() is consulted even if this is not the 32nd callback. This prevents the aforementioned 16-millisecond latency blow. Reported-by: Domas Mituzas Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5c160186d1..e2dbea6cee4b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2047,10 +2047,15 @@ rcu_check_quiescent_state(struct rcu_data *rdp) } /* Return true if callback-invocation time limit exceeded. */ -static bool rcu_do_batch_check_time(long count, long tlimit) +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) { // Invoke local_clock() only once per 32 consecutive callbacks. - return unlikely(tlimit) && !likely(count & 31) && local_clock() >= tlimit; + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; } /* @@ -2059,13 +2064,17 @@ static bool rcu_do_batch_check_time(long count, long tlimit) */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2090,11 +2099,14 @@ static void rcu_do_batch(struct rcu_data *rdp) div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && - unlikely(bl > 100)) { + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2134,7 +2146,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (rcu_do_batch_check_time(count, tlimit)) + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; } else { // In rcuc/rcuoc context, so no worries about @@ -2147,7 +2159,7 @@ static void rcu_do_batch(struct rcu_data *rdp) // But rcuc kthreads can delay quiescent-state // reporting, so check time limits for them. if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && - rcu_do_batch_check_time(count, tlimit)) { + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { rdp->rcu_cpu_has_work = 1; break; } -- cgit From a24c1aab652ebacf9ea62470a166514174c96fe1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Apr 2023 16:47:34 -0700 Subject: rcu: Mark rcu_cpu_kthread() accesses to ->rcu_cpu_has_work The rcu_data structure's ->rcu_cpu_has_work field can be modified by any CPU attempting to wake up the rcuc kthread. Therefore, this commit marks accesses to this field from the rcu_cpu_kthread() function. This data race was reported by KCSAN. Not appropriate for backporting due to failure being unlikely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2dbea6cee4b..faa1c01d2917 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2481,12 +2481,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; -- cgit From 15d44dfa40305da1648de4bf001e91cc63148725 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Apr 2023 10:50:47 -0700 Subject: rcu: Make rcu_cpu_starting() rely on interrupts being disabled Currently, rcu_cpu_starting() is written so that it might be invoked with interrupts enabled. However, it is always called when interrupts are disabled, either by rcu_init(), notify_cpu_starting(), or from a call point prior to the call to notify_cpu_starting(). But why bother requiring that interrupts be disabled? The purpose is to allow the rcu_data structure's ->beenonline flag to be set after all early processing has completed for the incoming CPU, thus allowing this flag to be used to determine when workqueues have been set up for the incoming CPU, while still allowing this flag to be used as a diagnostic within rcu_core(). This commit therefore makes rcu_cpu_starting() rely on interrupts being disabled. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index faa1c01d2917..23685407dcf6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4390,15 +4390,16 @@ int rcutree_offline_cpu(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. */ void rcu_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; bool newcpu; + lockdep_assert_irqs_disabled(); rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu_started) return; @@ -4406,7 +4407,6 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); raw_spin_lock(&rcu_state.barrier_lock); @@ -4425,17 +4425,16 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ /* rcu_report_qs_rnp() *really* wants some flags to restore */ - unsigned long flags2; + unsigned long flags; - local_irq_save(flags2); + local_irq_save(flags); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit From 401b0de3ae4fa49d1014c8941e26d9a25f37e7cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Apr 2023 11:11:29 -0700 Subject: rcu-tasks: Stop rcu_tasks_invoke_cbs() from using never-onlined CPUs The rcu_tasks_invoke_cbs() function relies on queue_work_on() to silently fall back to WORK_CPU_UNBOUND when the specified CPU is offline. However, the queue_work_on() function's silent fallback mechanism relies on that CPU having been online at some time in the past. When queue_work_on() is passed a CPU that has never been online, workqueue lockups ensue, which can be bad for your kernel's general health and well-being. This commit therefore checks whether a given CPU has ever been online, and, if not substitutes WORK_CPU_UNBOUND in the subsequent call to queue_work_on(). Why not simply omit the queue_work_on() call entirely? Because this function is flooding callback-invocation notifications to all CPUs, and must deal with possibilities that include a sparse cpu_possible_mask. This commit also moves the setting of the rcu_data structure's ->beenonline field to rcu_cpu_starting(), which executes on the incoming CPU before that CPU has ever enabled interrupts. This ensures that the required workqueues are present. In addition, because the incoming CPU has not yet enabled its interrupts, there cannot yet have been any softirq handlers running on this CPU, which means that the WARN_ON_ONCE(!rdp->beenonline) within the RCU_SOFTIRQ handler cannot have triggered yet. Fixes: d363f833c6d88 ("rcu-tasks: Use workqueues for multiple rcu_tasks_invoke_cbs() invocations") Reported-by: Tejun Heo Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23685407dcf6..54963f8c244c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4305,7 +4305,6 @@ int rcutree_prepare_cpu(unsigned int cpu) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. */ rdp->gp_seq = READ_ONCE(rnp->gp_seq); rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -4332,6 +4331,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Has the specified (known valid) CPU ever been fully online? + */ +bool rcu_cpu_beenfullyonline(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return smp_load_acquire(&rdp->beenonline); +} + /* * Near the end of the CPU-online process. Pretty much all services * enabled, and the CPU is now very much alive. @@ -4435,6 +4444,7 @@ void rcu_cpu_starting(unsigned int cpu) raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit