From 09853fb89f6bc46727ccd325c0f6f266df51d155 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 4 Mar 2023 13:40:55 -0800 Subject: rcu: Add comment to rcu_do_batch() identifying rcuoc code path This commit adds a comment to help explain why the "else" clause of the in_serving_softirq() "if" statement does not need to enforce a time limit. The reason is that this "else" clause handles rcuoc kthreads that do not block handlers for other softirq vectors. Acked-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e880c09ab59..06cc6a6ad819 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2131,6 +2131,8 @@ static void rcu_do_batch(struct rcu_data *rdp) break; } } else { + // In rcuoc context, so no worries about depriving + // other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); -- cgit From 46103fe01b02169150f16d8d6e028217c5a7abe5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 18 Jan 2023 15:30:14 +0800 Subject: rcu: Remove never-set needwake assignment from rcu_report_qs_rdp() The rcu_accelerate_cbs() function is invoked by rcu_report_qs_rdp() only if there is a grace period in progress that is still blocked by at least one CPU on this rcu_node structure. This means that rcu_accelerate_cbs() should never return the value true, and thus that this function should never set the needwake variable and in turn never invoke rcu_gp_kthread_wake(). This commit therefore removes the needwake variable and the invocation of rcu_gp_kthread_wake() in favor of a WARN_ON_ONCE() on the call to rcu_accelerate_cbs(). The purpose of this new WARN_ON_ONCE() is to detect situations where the system's opinion differs from ours. Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e880c09ab59..e80e8f128c57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1955,7 +1955,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; bool needacc = false; struct rcu_node *rnp; @@ -1987,7 +1986,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * NOCB kthreads have their own way to deal with that... */ if (!rcu_rdp_is_offloaded(rdp)) { - needwake = rcu_accelerate_cbs(rnp, rdp); + /* + * The current GP has not yet ended, so it + * should not be possible for rcu_accelerate_cbs() + * to return true. So complain, but don't awaken. + */ + WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { /* * ...but NOCB kthreads may miss or delay callbacks acceleration @@ -1999,8 +2003,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - if (needwake) - rcu_gp_kthread_wake(); if (needacc) { rcu_nocb_lock_irqsave(rdp, flags); -- cgit From 7ea91307ad2dbdd15ed0b762a2d994f816039b9d Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 12 Jan 2023 10:48:29 -0800 Subject: rcu: Permit start_poll_synchronize_rcu_expedited() to be invoked early According to the commit log of the patch that added it to the kernel, start_poll_synchronize_rcu_expedited() can be invoked very early, as in long before rcu_init() has been invoked. But before rcu_init(), the rcu_data structure's ->mynode field has not yet been initialized. This means that the start_poll_synchronize_rcu_expedited() function's attempt to set the CPU's leaf rcu_node structure's ->exp_seq_poll_rq field will result in a segmentation fault. This commit therefore causes start_poll_synchronize_rcu_expedited() to set ->exp_seq_poll_rq only after rcu_init() has initialized all CPUs' rcu_data structures' ->mynode fields. It also removes the check from the rcu_init() function so that start_poll_synchronize_rcu_expedited( is unconditionally invoked. Yes, this might result in an unnecessary boot-time grace period, but this is down in the noise. Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e80e8f128c57..90d54571126a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4942,9 +4942,8 @@ void __init rcu_init(void) else qovld_calc = qovld; - // Kick-start any polled grace periods that started early. - if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) - (void)start_poll_synchronize_rcu_expedited(); + // Kick-start in case any polled grace periods started early. + (void)start_poll_synchronize_rcu_expedited(); rcu_test_sync_prims(); } -- cgit From 7a29fb4a4771124bc61de397dbfc1554dbbcc19c Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Fri, 6 Jan 2023 15:09:34 +0800 Subject: rcu: Avoid stack overflow due to __rcu_irq_enter_check_tick() being kprobe-ed Registering a kprobe on __rcu_irq_enter_check_tick() can cause kernel stack overflow as shown below. This issue can be reproduced by enabling CONFIG_NO_HZ_FULL and booting the kernel with argument "nohz_full=", and then giving the following commands at the shell prompt: # cd /sys/kernel/tracing/ # echo 'p:mp1 __rcu_irq_enter_check_tick' >> kprobe_events # echo 1 > events/kprobes/enable This commit therefore adds __rcu_irq_enter_check_tick() to the kprobes blacklist using NOKPROBE_SYMBOL(). Insufficient stack space to handle exception! ESR: 0x00000000f2000004 -- BRK (AArch64) FAR: 0x0000ffffccf3e510 Task stack: [0xffff80000ad30000..0xffff80000ad38000] IRQ stack: [0xffff800008050000..0xffff800008058000] Overflow stack: [0xffff089c36f9f310..0xffff089c36fa0310] CPU: 5 PID: 190 Comm: bash Not tainted 6.2.0-rc2-00320-g1f5abbd77e2c #19 Hardware name: linux,dummy-virt (DT) pstate: 400003c5 (nZcv DAIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __rcu_irq_enter_check_tick+0x0/0x1b8 lr : ct_nmi_enter+0x11c/0x138 sp : ffff80000ad30080 x29: ffff80000ad30080 x28: ffff089c82e20000 x27: 0000000000000000 x26: 0000000000000000 x25: ffff089c02a8d100 x24: 0000000000000000 x23: 00000000400003c5 x22: 0000ffffccf3e510 x21: ffff089c36fae148 x20: ffff80000ad30120 x19: ffffa8da8fcce148 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: ffffa8da8e44ea6c x14: ffffa8da8e44e968 x13: ffffa8da8e03136c x12: 1fffe113804d6809 x11: ffff6113804d6809 x10: 0000000000000a60 x9 : dfff800000000000 x8 : ffff089c026b404f x7 : 00009eec7fb297f7 x6 : 0000000000000001 x5 : ffff80000ad30120 x4 : dfff800000000000 x3 : ffffa8da8e3016f4 x2 : 0000000000000003 x1 : 0000000000000000 x0 : 0000000000000000 Kernel panic - not syncing: kernel stack overflow CPU: 5 PID: 190 Comm: bash Not tainted 6.2.0-rc2-00320-g1f5abbd77e2c #19 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0xf8/0x108 show_stack+0x20/0x30 dump_stack_lvl+0x68/0x84 dump_stack+0x1c/0x38 panic+0x214/0x404 add_taint+0x0/0xf8 panic_bad_stack+0x144/0x160 handle_bad_stack+0x38/0x58 __bad_stack+0x78/0x7c __rcu_irq_enter_check_tick+0x0/0x1b8 arm64_enter_el1_dbg.isra.0+0x14/0x20 el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 arm64_enter_el1_dbg.isra.0+0x14/0x20 el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 arm64_enter_el1_dbg.isra.0+0x14/0x20 el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 [...] el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 arm64_enter_el1_dbg.isra.0+0x14/0x20 el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 arm64_enter_el1_dbg.isra.0+0x14/0x20 el1_dbg+0x2c/0x90 el1h_64_sync_handler+0xcc/0xe8 el1h_64_sync+0x64/0x68 __rcu_irq_enter_check_tick+0x0/0x1b8 el1_interrupt+0x28/0x60 el1h_64_irq_handler+0x18/0x28 el1h_64_irq+0x64/0x68 __ftrace_set_clr_event_nolock+0x98/0x198 __ftrace_set_clr_event+0x58/0x80 system_enable_write+0x144/0x178 vfs_write+0x174/0x738 ksys_write+0xd0/0x188 __arm64_sys_write+0x4c/0x60 invoke_syscall+0x64/0x180 el0_svc_common.constprop.0+0x84/0x160 do_el0_svc+0x48/0xe8 el0_svc+0x34/0xd0 el0t_64_sync_handler+0xb8/0xc0 el0t_64_sync+0x190/0x194 SMP: stopping secondary CPUs Kernel Offset: 0x28da86000000 from 0xffff800008000000 PHYS_OFFSET: 0xfffff76600000000 CPU features: 0x00000,01a00100,0000421b Memory Limit: none Acked-by: Joel Fernandes (Google) Link: https://lore.kernel.org/all/20221119040049.795065-1-zhengyejian1@huawei.com/ Fixes: aaf2bc50df1f ("rcu: Abstract out rcu_irq_enter_check_tick() from rcu_nmi_enter()") Signed-off-by: Zheng Yejian Cc: stable@vger.kernel.org Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90d54571126a..ee27a03d7576 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void) } raw_spin_unlock_rcu_node(rdp->mynode); } +NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); #endif /* CONFIG_NO_HZ_FULL */ /* -- cgit From 5da7cb193db32da783a3f3e77d8b639989321d48 Mon Sep 17 00:00:00 2001 From: Ziwei Dai Date: Fri, 31 Mar 2023 20:42:09 +0800 Subject: rcu/kvfree: Avoid freeing new kfree_rcu() memory after old grace period Memory passed to kvfree_rcu() that is to be freed is tracked by a per-CPU kfree_rcu_cpu structure, which in turn contains pointers to kvfree_rcu_bulk_data structures that contain pointers to memory that has not yet been handed to RCU, along with an kfree_rcu_cpu_work structure that tracks the memory that has already been handed to RCU. These structures track three categories of memory: (1) Memory for kfree(), (2) Memory for kvfree(), and (3) Memory for both that arrived during an OOM episode. The first two categories are tracked in a cache-friendly manner involving a dynamically allocated page of pointers (the aforementioned kvfree_rcu_bulk_data structures), while the third uses a simple (but decidedly cache-unfriendly) linked list through the rcu_head structures in each block of memory. On a given CPU, these three categories are handled as a unit, with that CPU's kfree_rcu_cpu_work structure having one pointer for each of the three categories. Clearly, new memory for a given category cannot be placed in the corresponding kfree_rcu_cpu_work structure until any old memory has had its grace period elapse and thus has been removed. And the kfree_rcu_monitor() function does in fact check for this. Except that the kfree_rcu_monitor() function checks these pointers one at a time. This means that if the previous kfree_rcu() memory passed to RCU had only category 1 and the current one has only category 2, the kfree_rcu_monitor() function will send that current category-2 memory along immediately. This can result in memory being freed too soon, that is, out from under unsuspecting RCU readers. To see this, consider the following sequence of events, in which: o Task A on CPU 0 calls rcu_read_lock(), then uses "from_cset", then is preempted. o CPU 1 calls kfree_rcu(cset, rcu_head) in order to free "from_cset" after a later grace period. Except that "from_cset" is freed right after the previous grace period ended, so that "from_cset" is immediately freed. Task A resumes and references "from_cset"'s member, after which nothing good happens. In full detail: CPU 0 CPU 1 ---------------------- ---------------------- count_memcg_event_mm() |rcu_read_lock() <--- |mem_cgroup_from_task() |// css_set_ptr is the "from_cset" mentioned on CPU 1 |css_set_ptr = rcu_dereference((task)->cgroups) |// Hard irq comes, current task is scheduled out. cgroup_attach_task() |cgroup_migrate() |cgroup_migrate_execute() |css_set_move_task(task, from_cset, to_cset, true) |cgroup_move_task(task, to_cset) |rcu_assign_pointer(.., to_cset) |... |cgroup_migrate_finish() |put_css_set_locked(from_cset) |from_cset->refcount return 0 |kfree_rcu(cset, rcu_head) // free from_cset after new gp |add_ptr_to_bulk_krc_lock() |schedule_delayed_work(&krcp->monitor_work, ..) kfree_rcu_monitor() |krcp->bulk_head[0]'s work attached to krwp->bulk_head_free[] |queue_rcu_work(system_wq, &krwp->rcu_work) |if rwork->rcu.work is not in WORK_STRUCT_PENDING_BIT state, |call_rcu(&rwork->rcu, rcu_work_rcufn) <--- request new gp // There is a perious call_rcu(.., rcu_work_rcufn) // gp end, rcu_work_rcufn() is called. rcu_work_rcufn() |__queue_work(.., rwork->wq, &rwork->work); |kfree_rcu_work() |krwp->bulk_head_free[0] bulk is freed before new gp end!!! |The "from_cset" is freed before new gp end. // the task resumes some time later. |css_set_ptr->subsys[(subsys_id) <--- Caused kernel crash, because css_set_ptr is freed. This commit therefore causes kfree_rcu_monitor() to refrain from moving kfree_rcu() memory to the kfree_rcu_cpu_work structure until the RCU grace period has completed for all three categories. v2: Use helper function instead of inserted code block at kfree_rcu_monitor(). Fixes: 34c881745549 ("rcu: Support kfree_bulk() interface in kfree_rcu()") Fixes: 5f3c8d620447 ("rcu/tree: Maintain separate array for vmalloc ptrs") Reported-by: Mukesh Ojha Signed-off-by: Ziwei Dai Reviewed-by: Uladzislau Rezki (Sony) Tested-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e880c09ab59..7b95ee98a1a5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3024,6 +3024,18 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) return !!READ_ONCE(krcp->head); } +static bool +need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krwp->bulk_head_free[i])) + return true; + + return !!krwp->head_free; +} + static int krc_count(struct kfree_rcu_cpu *krcp) { int sum = atomic_read(&krcp->head_count); @@ -3107,15 +3119,14 @@ static void kfree_rcu_monitor(struct work_struct *work) for (i = 0; i < KFREE_N_BATCHES; i++) { struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - // Try to detach bulk_head or head and attach it over any - // available corresponding free channel. It can be that - // a previous RCU batch is in progress, it means that - // immediately to queue another one is not possible so - // in that case the monitor work is rearmed. - if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || - (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || - (READ_ONCE(krcp->head) && !krwp->head_free)) { + // Try to detach bulk_head or head and attach it, only when + // all channels are free. Any channel is not free means at krwp + // there is on-going rcu work to handle krwp's free business. + if (need_wait_for_krwp_work(krwp)) + continue; + // kvfree_rcu_drain_ready() might handle this krcp, if so give up. + if (need_offload_krc(krcp)) { // Channel 1 corresponds to the SLAB-pointer bulk path. // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { -- cgit