| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-05-01 15:20:08 -0700 | 
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-05-01 15:20:08 -0700 | 
| commit | 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e (patch) | |
| tree | d57f3a63479a07b4e0cece029886e76e04feb984 /kernel/rcu/tree.c | |
| parent | 5dc63e56a9cf8df0b59c234a505a1653f1bdf885 (diff) | |
| parent | 53bea86b5712c7491bb3dae12e271666df0a308c (diff) | |
Merge branch 'next' into for-linus
Prepare input updates for 6.4 merge window.
Diffstat (limited to 'kernel/rcu/tree.c')
| -rw-r--r-- | kernel/rcu/tree.c | 657 | 
1 file changed, 382 insertions, 275 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821a..8e880c09ab59 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly;  static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,  			      unsigned long gps, unsigned long flags); -static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);  static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);  static void invoke_rcu_core(void);  static void rcu_report_exp_rdp(struct rcu_data *rdp);  static void sync_sched_exp_online_cleanup(int cpu);  static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); +static bool rcu_rdp_cpu_online(struct rcu_data *rdp); +static bool rcu_init_invoked(void); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);  /*   * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" @@ -215,27 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);  #define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */  /* - * Compute the mask of online CPUs for the specified rcu_node structure. - * This will not be stable unless the rcu_node structure's ->lock is - * held, but the bit corresponding to the current CPU will be stable - * in most contexts. - */ -static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) -{ -	return READ_ONCE(rnp->qsmaskinitnext); -} - -/* - * Is the CPU corresponding to the specified rcu_data structure online - * from RCU's perspective?  This perspective is given by that structure's - * ->qsmaskinitnext field rather than by the global cpu_online_mask. - */ -static bool rcu_rdp_cpu_online(struct rcu_data *rdp) -{ -	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); -} - -/*   * Return true if an RCU grace period is in progress.  The READ_ONCE()s   * permit this function to be invoked without holding the root rcu_node   * structure's ->lock, but of course results can be subject to change. @@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)  	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);  } -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) - -/* - * Is the current CPU online as far as RCU is concerned? - * - * Disable preemption to avoid false positives that could otherwise - * happen due to the current CPU number being sampled, this task being - * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. - * - * Disable checking if in an NMI handler because we cannot safely - * report errors from NMI handlers anyway.  In addition, it is OK to use - * RCU on an offline processor during initial boot, hence the check for - * rcu_scheduler_fully_active. - */ -bool rcu_lockdep_current_cpu_online(void) -{ -	struct rcu_data *rdp; -	bool ret = false; - -	if (in_nmi() || !rcu_scheduler_fully_active) -		return true; -	preempt_disable_notrace(); -	rdp = this_cpu_ptr(&rcu_data); -	/* -	 * Strictly, we care here about the case where the current CPU is -	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask -	 * not being up to date. So arch_spin_is_locked() might have a -	 * false positive if it's held by some *other* CPU, but that's -	 * OK because that just means a false *negative* on the warning. 
-	 */ -	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) -		ret = true; -	preempt_enable_notrace(); -	return ret; -} -EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); - -#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ -  /*   * When trying to report a quiescent state on behalf of some other CPU,   * it is our responsibility to check for and handle potential overflow @@ -925,6 +866,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  			rdp->rcu_iw_gp_seq = rnp->gp_seq;  			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);  		} + +		if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { +			int cpu = rdp->cpu; +			struct rcu_snap_record *rsrp; +			struct kernel_cpustat *kcsp; + +			kcsp = &kcpustat_cpu(cpu); + +			rsrp = &rdp->snap_record; +			rsrp->cputime_irq     = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); +			rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); +			rsrp->cputime_system  = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); +			rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); +			rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); +			rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); +			rsrp->jiffies = jiffies; +			rsrp->gp_seq = rdp->gp_seq; +		}  	}  	return 0; @@ -1350,13 +1309,6 @@ static void rcu_strict_gp_boundary(void *unused)  	invoke_rcu_core();  } -// Has rcu_init() been invoked?  This is used (for example) to determine -// whether spinlocks may be acquired safely. -static bool rcu_init_invoked(void) -{ -	return !!rcu_state.n_online_cpus; -} -  // Make the polled API aware of the beginning of a grace period.  static void rcu_poll_gp_seq_start(unsigned long *snap)  { @@ -2092,92 +2044,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)  }  /* - * Near the end of the offline process.  Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ -	bool blkd; -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	struct rcu_node *rnp = rdp->mynode; - -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) -		return 0; - -	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); -	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), -			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); -	return 0; -} - -/* - * All CPUs for the specified rcu_node structure have gone offline, - * and all tasks that were preempted within an RCU read-side critical - * section while running on one of those CPUs have since exited their RCU - * read-side critical section.  Some other CPU is reporting this fact with - * the specified rcu_node structure's ->lock held and interrupts disabled. - * This function therefore goes up the tree of rcu_node structures, - * clearing the corresponding bits in the ->qsmaskinit fields.  Note that - * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated. - * - * This function does check that the specified rcu_node structure has - * all CPUs offline and no blocked tasks, so it is OK to invoke it - * prematurely.  That said, invoking it after the fact will cost you - * a needless lock acquisition.  So once it has done its work, don't - * invoke it again. 
- */ -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ -	long mask; -	struct rcu_node *rnp = rnp_leaf; - -	raw_lockdep_assert_held_rcu_node(rnp_leaf); -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || -	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) || -	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) -		return; -	for (;;) { -		mask = rnp->grpmask; -		rnp = rnp->parent; -		if (!rnp) -			break; -		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ -		rnp->qsmaskinit &= ~mask; -		/* Between grace periods, so better already be zero! */ -		WARN_ON_ONCE(rnp->qsmask); -		if (rnp->qsmaskinit) { -			raw_spin_unlock_rcu_node(rnp); -			/* irqs remain disabled. */ -			return; -		} -		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ -	} -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context.  Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */ - -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) -		return 0; - -	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); -	/* Adjust any no-longer-needed kthreads. */ -	rcu_boost_kthread_setaffinity(rnp, -1); -	// Stop-machine done, so allow nohz_full to disable tick. -	tick_dep_clear(TICK_DEP_BIT_RCU); -	return 0; -} - -/*   * Invoke any RCU callbacks that have made it to the end of their grace   * period.  Throttle as specified by rdp->blimit.   */ @@ -2209,7 +2075,7 @@ static void rcu_do_batch(struct rcu_data *rdp)  	 */  	rcu_nocb_lock_irqsave(rdp, flags);  	WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); -	pending = rcu_segcblist_n_cbs(&rdp->cblist); +	pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);  	div = READ_ONCE(rcu_divisor);  	div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;  	bl = max(rdp->blimit, pending >> div); @@ -2727,10 +2593,11 @@ static void check_cb_ovld(struct rcu_data *rdp)  }  static void -__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)  {  	static atomic_t doublefrees;  	unsigned long flags; +	bool lazy;  	struct rcu_data *rdp;  	bool was_alldone; @@ -2755,6 +2622,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)  	kasan_record_aux_stack_noalloc(head);  	local_irq_save(flags);  	rdp = this_cpu_ptr(&rcu_data); +	lazy = lazy_in && !rcu_async_should_hurry();  	/* Add the callback to our list. */  	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { @@ -2876,13 +2744,15 @@ EXPORT_SYMBOL_GPL(call_rcu);  /**   * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. 
All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk   * @nr_records: Number of active pointers in the array - * @next: Next bulk object in the block chain   * @records: Array of the kvfree_rcu() pointers   */  struct kvfree_rcu_bulk_data { +	struct list_head list; +	unsigned long gp_snap;  	unsigned long nr_records; -	struct kvfree_rcu_bulk_data *next;  	void *records[];  }; @@ -2898,26 +2768,28 @@ struct kvfree_rcu_bulk_data {   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests   * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period   * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period   * @krcp: Pointer to @kfree_rcu_cpu structure   */  struct kfree_rcu_cpu_work {  	struct rcu_work rcu_work;  	struct rcu_head *head_free; -	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; +	struct list_head bulk_head_free[FREE_N_CHANNELS];  	struct kfree_rcu_cpu *krcp;  };  /**   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period   * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period   * @lock: Synchronize access to this structure   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES   * @initialized: The @rcu_work fields have been initialized - * @count: Number of objects for which GP not started + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list   * @bkvcache:   *	A simple cache list that contains objects for reuse purpose.   *	In order to save some per-cpu space the list is singular. @@ -2935,13 +2807,20 @@ struct kfree_rcu_cpu_work {   * the interactions with the slab allocators.   */  struct kfree_rcu_cpu { +	// Objects queued on a linked list +	// through their rcu_head structures.  	struct rcu_head *head; -	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; +	unsigned long head_gp_snap; +	atomic_t head_count; + +	// Objects queued on a bulk-list. +	struct list_head bulk_head[FREE_N_CHANNELS]; +	atomic_t bulk_count[FREE_N_CHANNELS]; +  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];  	raw_spinlock_t lock;  	struct delayed_work monitor_work;  	bool initialized; -	int count;  	struct delayed_work page_cache_work;  	atomic_t backoff_page_cache_fill; @@ -3029,29 +2908,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)  	return freed;  } +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, +	struct kvfree_rcu_bulk_data *bnode, int idx) +{ +	unsigned long flags; +	int i; + +	debug_rcu_bhead_unqueue(bnode); + +	rcu_lock_acquire(&rcu_callback_map); +	if (idx == 0) { // kmalloc() / kfree(). +		trace_rcu_invoke_kfree_bulk_callback( +			rcu_state.name, bnode->nr_records, +			bnode->records); + +		kfree_bulk(bnode->nr_records, bnode->records); +	} else { // vmalloc() / vfree(). 
+		for (i = 0; i < bnode->nr_records; i++) { +			trace_rcu_invoke_kvfree_callback( +				rcu_state.name, bnode->records[i], 0); + +			vfree(bnode->records[i]); +		} +	} +	rcu_lock_release(&rcu_callback_map); + +	raw_spin_lock_irqsave(&krcp->lock, flags); +	if (put_cached_bnode(krcp, bnode)) +		bnode = NULL; +	raw_spin_unlock_irqrestore(&krcp->lock, flags); + +	if (bnode) +		free_page((unsigned long) bnode); + +	cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ +	struct rcu_head *next; + +	for (; head; head = next) { +		void *ptr = (void *) head->func; +		unsigned long offset = (void *) head - ptr; + +		next = head->next; +		debug_rcu_head_unqueue((struct rcu_head *)ptr); +		rcu_lock_acquire(&rcu_callback_map); +		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); + +		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) +			kvfree(ptr); + +		rcu_lock_release(&rcu_callback_map); +		cond_resched_tasks_rcu_qs(); +	} +} +  /*   * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bkvhead_free or ->head_free. + * It frees all the objects queued on ->bulk_head_free or ->head_free.   */  static void kfree_rcu_work(struct work_struct *work)  {  	unsigned long flags; -	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; -	struct rcu_head *head, *next; +	struct kvfree_rcu_bulk_data *bnode, *n; +	struct list_head bulk_head[FREE_N_CHANNELS]; +	struct rcu_head *head;  	struct kfree_rcu_cpu *krcp;  	struct kfree_rcu_cpu_work *krwp; -	int i, j; +	int i;  	krwp = container_of(to_rcu_work(work), -			    struct kfree_rcu_cpu_work, rcu_work); +		struct kfree_rcu_cpu_work, rcu_work);  	krcp = krwp->krcp;  	raw_spin_lock_irqsave(&krcp->lock, flags);  	// Channels 1 and 2. -	for (i = 0; i < FREE_N_CHANNELS; i++) { -		bkvhead[i] = krwp->bkvhead_free[i]; -		krwp->bkvhead_free[i] = NULL; -	} +	for (i = 0; i < FREE_N_CHANNELS; i++) +		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);  	// Channel 3.  	head = krwp->head_free; @@ -3060,39 +2997,9 @@ static void kfree_rcu_work(struct work_struct *work)  	// Handle the first two channels.  	for (i = 0; i < FREE_N_CHANNELS; i++) { -		for (; bkvhead[i]; bkvhead[i] = bnext) { -			bnext = bkvhead[i]->next; -			debug_rcu_bhead_unqueue(bkvhead[i]); - -			rcu_lock_acquire(&rcu_callback_map); -			if (i == 0) { // kmalloc() / kfree(). -				trace_rcu_invoke_kfree_bulk_callback( -					rcu_state.name, bkvhead[i]->nr_records, -					bkvhead[i]->records); - -				kfree_bulk(bkvhead[i]->nr_records, -					bkvhead[i]->records); -			} else { // vmalloc() / vfree(). -				for (j = 0; j < bkvhead[i]->nr_records; j++) { -					trace_rcu_invoke_kvfree_callback( -						rcu_state.name, -						bkvhead[i]->records[j], 0); - -					vfree(bkvhead[i]->records[j]); -				} -			} -			rcu_lock_release(&rcu_callback_map); - -			raw_spin_lock_irqsave(&krcp->lock, flags); -			if (put_cached_bnode(krcp, bkvhead[i])) -				bkvhead[i] = NULL; -			raw_spin_unlock_irqrestore(&krcp->lock, flags); - -			if (bkvhead[i]) -				free_page((unsigned long) bkvhead[i]); - -			cond_resched_tasks_rcu_qs(); -		} +		// Start from the tail page, so a GP is likely passed for it. +		list_for_each_entry_safe(bnode, n, &bulk_head[i], list) +			kvfree_rcu_bulk(krcp, bnode, i);  	}  	/* @@ -3102,21 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)  	 * queued on a linked list through their rcu_head structures.  	 * This list is named "Channel 3".  	 
*/ -	for (; head; head = next) { -		unsigned long offset = (unsigned long)head->func; -		void *ptr = (void *)head - offset; - -		next = head->next; -		debug_rcu_head_unqueue((struct rcu_head *)ptr); -		rcu_lock_acquire(&rcu_callback_map); -		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - -		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) -			kvfree(ptr); - -		rcu_lock_release(&rcu_callback_map); -		cond_resched_tasks_rcu_qs(); -	} +	kvfree_rcu_list(head);  }  static bool @@ -3125,10 +3018,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)  	int i;  	for (i = 0; i < FREE_N_CHANNELS; i++) -		if (krcp->bkvhead[i]) +		if (!list_empty(&krcp->bulk_head[i]))  			return true; -	return !!krcp->head; +	return !!READ_ONCE(krcp->head); +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ +	int sum = atomic_read(&krcp->head_count); +	int i; + +	for (i = 0; i < FREE_N_CHANNELS; i++) +		sum += atomic_read(&krcp->bulk_count[i]); + +	return sum;  }  static void @@ -3136,7 +3040,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)  {  	long delay, delay_left; -	delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; +	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;  	if (delayed_work_pending(&krcp->monitor_work)) {  		delay_left = krcp->monitor_work.timer.expires - jiffies;  		if (delay < delay_left) @@ -3146,6 +3050,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)  	queue_delayed_work(system_wq, &krcp->monitor_work, delay);  } +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ +	struct list_head bulk_ready[FREE_N_CHANNELS]; +	struct kvfree_rcu_bulk_data *bnode, *n; +	struct rcu_head *head_ready = NULL; +	unsigned long flags; +	int i; + +	raw_spin_lock_irqsave(&krcp->lock, flags); +	for (i = 0; i < FREE_N_CHANNELS; i++) { +		INIT_LIST_HEAD(&bulk_ready[i]); + +		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { +			if (!poll_state_synchronize_rcu(bnode->gp_snap)) +				break; + +			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); +			list_move(&bnode->list, &bulk_ready[i]); +		} +	} + +	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { +		head_ready = krcp->head; +		atomic_set(&krcp->head_count, 0); +		WRITE_ONCE(krcp->head, NULL); +	} +	raw_spin_unlock_irqrestore(&krcp->lock, flags); + +	for (i = 0; i < FREE_N_CHANNELS; i++) { +		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) +			kvfree_rcu_bulk(krcp, bnode, i); +	} + +	if (head_ready) +		kvfree_rcu_list(head_ready); +} +  /*   * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.   */ @@ -3156,26 +3098,31 @@ static void kfree_rcu_monitor(struct work_struct *work)  	unsigned long flags;  	int i, j; +	// Drain ready for reclaim. +	kvfree_rcu_drain_ready(krcp); +  	raw_spin_lock_irqsave(&krcp->lock, flags);  	// Attempt to start a new batch.  	for (i = 0; i < KFREE_N_BATCHES; i++) {  		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); -		// Try to detach bkvhead or head and attach it over any +		// Try to detach bulk_head or head and attach it over any  		// available corresponding free channel. It can be that  		// a previous RCU batch is in progress, it means that  		// immediately to queue another one is not possible so  		// in that case the monitor work is rearmed. 
-		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || -			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || -				(krcp->head && !krwp->head_free)) { +		if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || +			(!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || +				(READ_ONCE(krcp->head) && !krwp->head_free)) { +  			// Channel 1 corresponds to the SLAB-pointer bulk path.  			// Channel 2 corresponds to vmalloc-pointer bulk path.  			for (j = 0; j < FREE_N_CHANNELS; j++) { -				if (!krwp->bkvhead_free[j]) { -					krwp->bkvhead_free[j] = krcp->bkvhead[j]; -					krcp->bkvhead[j] = NULL; +				if (list_empty(&krwp->bulk_head_free[j])) { +					atomic_set(&krcp->bulk_count[j], 0); +					list_replace_init(&krcp->bulk_head[j], +						&krwp->bulk_head_free[j]);  				}  			} @@ -3183,11 +3130,10 @@ static void kfree_rcu_monitor(struct work_struct *work)  			// objects queued on the linked list.  			if (!krwp->head_free) {  				krwp->head_free = krcp->head; -				krcp->head = NULL; +				atomic_set(&krcp->head_count, 0); +				WRITE_ONCE(krcp->head, NULL);  			} -			WRITE_ONCE(krcp->count, 0); -  			// One work is per one batch, so there are three  			// "free channels", the batch can handle. It can  			// be that the work is in the pending state when @@ -3197,6 +3143,8 @@ static void kfree_rcu_monitor(struct work_struct *work)  		}  	} +	raw_spin_unlock_irqrestore(&krcp->lock, flags); +  	// If there is nothing to detach, it means that our job is  	// successfully done here. In case of having at least one  	// of the channels that is still busy we should rearm the @@ -3204,8 +3152,6 @@ static void kfree_rcu_monitor(struct work_struct *work)  	// still in progress.  	if (need_offload_krc(krcp))  		schedule_delayed_monitor_work(krcp); - -	raw_spin_unlock_irqrestore(&krcp->lock, flags);  }  static enum hrtimer_restart @@ -3288,10 +3234,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,  		return false;  	idx = !!is_vmalloc_addr(ptr); +	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], +		struct kvfree_rcu_bulk_data, list);  	/* Check if a new block is required. */ -	if (!(*krcp)->bkvhead[idx] || -			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { +	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {  		bnode = get_cached_bnode(*krcp);  		if (!bnode && can_alloc) {  			krc_this_cpu_unlock(*krcp, *flags); @@ -3315,17 +3262,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,  		if (!bnode)  			return false; -		/* Initialize the new block. */ +		// Initialize the new block and attach it.  		bnode->nr_records = 0; -		bnode->next = (*krcp)->bkvhead[idx]; - -		/* Attach it to the head. */ -		(*krcp)->bkvhead[idx] = bnode; +		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);  	} -	/* Finally insert. */ -	(*krcp)->bkvhead[idx]->records -		[(*krcp)->bkvhead[idx]->nr_records++] = ptr; +	// Finally insert and update the GP for this page. +	bnode->records[bnode->nr_records++] = ptr; +	bnode->gp_snap = get_state_synchronize_rcu(); +	atomic_inc(&(*krcp)->bulk_count[idx]);  	return true;  } @@ -3342,26 +3287,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,   * be free'd in workqueue context. This allows us to: batch requests together to   * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.   
*/ -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr)  {  	unsigned long flags;  	struct kfree_rcu_cpu *krcp;  	bool success; -	void *ptr; -	if (head) { -		ptr = (void *) head - (unsigned long) func; -	} else { -		/* -		 * Please note there is a limitation for the head-less -		 * variant, that is why there is a clear rule for such -		 * objects: it can be used from might_sleep() context -		 * only. For other places please embed an rcu_head to -		 * your data. -		 */ +	/* +	 * Please note there is a limitation for the head-less +	 * variant, that is why there is a clear rule for such +	 * objects: it can be used from might_sleep() context +	 * only. For other places please embed an rcu_head to +	 * your data. +	 */ +	if (!head)  		might_sleep(); -		ptr = (unsigned long *) func; -	}  	// Queue the object but don't yet schedule the batch.  	if (debug_rcu_head_queue(ptr)) { @@ -3382,14 +3322,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)  			// Inline if kvfree_rcu(one_arg) call.  			goto unlock_return; -		head->func = func; +		head->func = ptr;  		head->next = krcp->head; -		krcp->head = head; +		WRITE_ONCE(krcp->head, head); +		atomic_inc(&krcp->head_count); + +		// Take a snapshot for this krcp. +		krcp->head_gp_snap = get_state_synchronize_rcu();  		success = true;  	} -	WRITE_ONCE(krcp->count, krcp->count + 1); -  	// Set timer to drain after KFREE_DRAIN_JIFFIES.  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)  		schedule_delayed_monitor_work(krcp); @@ -3420,7 +3362,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)  	for_each_possible_cpu(cpu) {  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		count += READ_ONCE(krcp->count); +		count += krc_count(krcp);  		count += READ_ONCE(krcp->nr_bkv_objs);  		atomic_set(&krcp->backoff_page_cache_fill, 1);  	} @@ -3437,7 +3379,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)  		int count;  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		count = krcp->count; +		count = krc_count(krcp);  		count += drain_page_cache(krcp);  		kfree_rcu_monitor(&krcp->monitor_work.work); @@ -3461,15 +3403,12 @@ static struct shrinker kfree_rcu_shrinker = {  void __init kfree_rcu_scheduler_running(void)  {  	int cpu; -	unsigned long flags;  	for_each_possible_cpu(cpu) {  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		raw_spin_lock_irqsave(&krcp->lock, flags);  		if (need_offload_krc(krcp))  			schedule_delayed_monitor_work(krcp); -		raw_spin_unlock_irqrestore(&krcp->lock, flags);  	}  } @@ -3485,9 +3424,10 @@ void __init kfree_rcu_scheduler_running(void)   */  static int rcu_blocking_is_gp(void)  { -	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) +	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) { +		might_sleep();  		return false; -	might_sleep();  /* Check for RCU read-side critical section. */ +	}  	return true;  } @@ -3711,7 +3651,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);   * If @false is returned, it is the caller's responsibility to invoke this   * function later on until it does return @true.  Alternatively, the caller   * can explicitly wait for a grace period, for example, by passing @oldstate - * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() + * on the one hand or by directly invoking either synchronize_rcu() or + * synchronize_rcu_expedited() on the other. 
  *   * Yes, this function does not take counter wrap into account.   * But counter wrap is harmless.  If the counter wraps, we have waited for @@ -3722,6 +3664,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);   * completed.  Alternatively, they can use get_completed_synchronize_rcu()   * to get a guaranteed-completed grace-period state.   * + * In addition, because oldstate compresses the grace-period state for + * both normal and expedited grace periods into a single unsigned long, + * it can miss a grace period when synchronize_rcu() runs concurrently + * with synchronize_rcu_expedited().  If this is unacceptable, please + * instead use the _full() variant of these polling APIs. + *   * This function provides the same memory-ordering guarantees that   * would be provided by a synchronize_rcu() that was invoked at the call   * to the function that provided @oldstate, and that returned at the end @@ -4080,6 +4028,155 @@ retry:  EXPORT_SYMBOL_GPL(rcu_barrier);  /* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ +	return READ_ONCE(rnp->qsmaskinitnext); +} + +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective?  This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ +	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online as far as RCU is concerned? + * + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. + * + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway.  In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. + */ +bool rcu_lockdep_current_cpu_online(void) +{ +	struct rcu_data *rdp; +	bool ret = false; + +	if (in_nmi() || !rcu_scheduler_fully_active) +		return true; +	preempt_disable_notrace(); +	rdp = this_cpu_ptr(&rcu_data); +	/* +	 * Strictly, we care here about the case where the current CPU is +	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask +	 * not being up to date. So arch_spin_is_locked() might have a +	 * false positive if it's held by some *other* CPU, but that's +	 * OK because that just means a false *negative* on the warning. +	 */ +	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) +		ret = true; +	preempt_enable_notrace(); +	return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +// Has rcu_init() been invoked?  This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ +	return !!rcu_state.n_online_cpus; +} + +/* + * Near the end of the offline process.  Trace the fact that this CPU + * is going offline. 
+ */ +int rcutree_dying_cpu(unsigned int cpu) +{ +	bool blkd; +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	struct rcu_node *rnp = rdp->mynode; + +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) +		return 0; + +	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); +	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), +			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); +	return 0; +} + +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section.  Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields.  Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated. + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely.  That said, invoking it after the fact will cost you + * a needless lock acquisition.  So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +	long mask; +	struct rcu_node *rnp = rnp_leaf; + +	raw_lockdep_assert_held_rcu_node(rnp_leaf); +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || +	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) || +	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) +		return; +	for (;;) { +		mask = rnp->grpmask; +		rnp = rnp->parent; +		if (!rnp) +			break; +		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ +		rnp->qsmaskinit &= ~mask; +		/* Between grace periods, so better already be zero! */ +		WARN_ON_ONCE(rnp->qsmask); +		if (rnp->qsmaskinit) { +			raw_spin_unlock_rcu_node(rnp); +			/* irqs remain disabled. */ +			return; +		} +		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ +	} +} + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context.  Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) +		return 0; + +	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); +	// Stop-machine done, so allow nohz_full to disable tick. +	tick_dep_clear(TICK_DEP_BIT_RCU); +	return 0; +} + +/*   * Propagate ->qsinitmask bits up the rcu_node tree to account for the   * first CPU in a given leaf rcu_node structure coming online.  The caller   * must hold the corresponding leaf rcu_node ->lock with interrupts @@ -4408,11 +4505,13 @@ static int rcu_pm_notify(struct notifier_block *self,  	switch (action) {  	case PM_HIBERNATION_PREPARE:  	case PM_SUSPEND_PREPARE: +		rcu_async_hurry();  		rcu_expedite_gp();  		break;  	case PM_POST_HIBERNATION:  	case PM_POST_SUSPEND:  		rcu_unexpedite_gp(); +		rcu_async_relax();  		break;  	default:  		break; @@ -4766,7 +4865,7 @@ struct workqueue_struct *rcu_gp_wq;  static void __init kfree_rcu_batch_init(void)  {  	int cpu; -	int i; +	int i, j;  	/* Clamp it to [0:100] seconds interval. 
*/  	if (rcu_delay_page_cache_fill_msec < 0 || @@ -4786,8 +4885,14 @@ static void __init kfree_rcu_batch_init(void)  		for (i = 0; i < KFREE_N_BATCHES; i++) {  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);  			krcp->krw_arr[i].krcp = krcp; + +			for (j = 0; j < FREE_N_CHANNELS; j++) +				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);  		} +		for (i = 0; i < FREE_N_CHANNELS; i++) +			INIT_LIST_HEAD(&krcp->bulk_head[i]); +  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);  		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);  		krcp->initialized = true; @@ -4838,6 +4943,8 @@ void __init rcu_init(void)  	// Kick-start any polled grace periods that started early.  	if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))  		(void)start_poll_synchronize_rcu_expedited(); + +	rcu_test_sync_prims();  }  #include "tree_stall.h"  | 
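The core of this patch converts the kvfree_rcu() batching machinery from singly linked bulk pages to list_head-based lists whose readiness is tracked with the polled grace-period API: each bulk page records ->gp_snap and the rcu_head channel records ->head_gp_snap at queue time, and kfree_rcu_monitor() now calls kvfree_rcu_drain_ready() to free whatever has already waited out a grace period. The sketch below illustrates only that polling pattern; it is not code from this patch, and the example_* names are made up.

```c
#include <linux/rcupdate.h>

/*
 * Minimal sketch of the polled grace-period pattern used above:
 * take a cookie when an object is queued, then free the object
 * once poll_state_synchronize_rcu() reports that a full grace
 * period has elapsed since the snapshot.
 */
static unsigned long example_gp_snap;

static void example_queue_object(void)
{
	/* Analogous to bnode->gp_snap / krcp->head_gp_snap above. */
	example_gp_snap = get_state_synchronize_rcu();
}

static bool example_object_is_reclaimable(void)
{
	/*
	 * True once a grace period has completed since the snapshot,
	 * so the object can be freed without blocking in
	 * synchronize_rcu().
	 */
	return poll_state_synchronize_rcu(example_gp_snap);
}
```

Polling the snapshot lets kvfree_rcu_drain_ready() free already-expired pages directly from the monitor work instead of routing them through another queue_rcu_work() batch, which is why the drain walks each bulk list from the tail, where the oldest pages sit.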
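Callers are expected to be unaffected by the kvfree_call_rcu() signature change from an offset-encoded rcu_callback_t to a plain object pointer; the kfree_rcu()/kvfree_rcu() macros (updated outside this file) absorb the difference. A caller-side sketch for orientation, using a hypothetical structure that is not part of this patch:

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical object with an embedded rcu_head for kfree_rcu(). */
struct example_obj {
	int data;
	struct rcu_head rcu;
};

static void example_release(struct example_obj *p)
{
	/*
	 * The two-argument form queues the object on the per-CPU
	 * kvfree_rcu() batching machinery shown in this diff; after a
	 * grace period it is freed, typically via the kfree_bulk()
	 * path in kvfree_rcu_bulk().
	 */
	kfree_rcu(p, rcu);
}
```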
