diff options
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 305 |
1 files changed, 191 insertions, 114 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4374d7af2c11..739219a78b84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2726,13 +2726,15 @@ EXPORT_SYMBOL_GPL(call_rcu); /** * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk * @nr_records: Number of active pointers in the array - * @next: Next bulk object in the block chain * @records: Array of the kvfree_rcu() pointers */ struct kvfree_rcu_bulk_data { + struct list_head list; + unsigned long gp_snap; unsigned long nr_records; - struct kvfree_rcu_bulk_data *next; void *records[]; }; @@ -2748,26 +2750,28 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; + struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @initialized: The @rcu_work fields have been initialized - * @count: Number of objects for which GP not started + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list * @bkvcache: * A simple cache list that contains objects for reuse purpose. * In order to save some per-cpu space the list is singular. @@ -2785,13 +2789,20 @@ struct kfree_rcu_cpu_work { * the interactions with the slab allocators. */ struct kfree_rcu_cpu { + // Objects queued on a linked list + // through their rcu_head structures. struct rcu_head *head; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; + unsigned long head_gp_snap; + atomic_t head_count; + + // Objects queued on a bulk-list. + struct list_head bulk_head[FREE_N_CHANNELS]; + atomic_t bulk_count[FREE_N_CHANNELS]; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool initialized; - int count; struct delayed_work page_cache_work; atomic_t backoff_page_cache_fill; @@ -2879,29 +2890,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) return freed; } +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode, int idx) +{ + unsigned long flags; + int i; + + debug_rcu_bhead_unqueue(bnode); + + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } + } + rcu_lock_release(&rcu_callback_map); + + raw_spin_lock_irqsave(&krcp->lock, flags); + if (put_cached_bnode(krcp, bnode)) + bnode = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (bnode) + free_page((unsigned long) bnode); + + cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ + struct rcu_head *next; + + for (; head; head = next) { + void *ptr = (void *) head->func; + unsigned long offset = (void *) head - ptr; + + next = head->next; + debug_rcu_head_unqueue((struct rcu_head *)ptr); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); + + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bkvhead_free or ->head_free. + * It frees all the objects queued on ->bulk_head_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; - struct rcu_head *head, *next; + struct kvfree_rcu_bulk_data *bnode, *n; + struct list_head bulk_head[FREE_N_CHANNELS]; + struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; - int i, j; + int i; krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); + struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; raw_spin_lock_irqsave(&krcp->lock, flags); // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) { - bkvhead[i] = krwp->bkvhead_free[i]; - krwp->bkvhead_free[i] = NULL; - } + for (i = 0; i < FREE_N_CHANNELS; i++) + list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); // Channel 3. head = krwp->head_free; @@ -2910,39 +2979,9 @@ static void kfree_rcu_work(struct work_struct *work) // Handle the first two channels. for (i = 0; i < FREE_N_CHANNELS; i++) { - for (; bkvhead[i]; bkvhead[i] = bnext) { - bnext = bkvhead[i]->next; - debug_rcu_bhead_unqueue(bkvhead[i]); - - rcu_lock_acquire(&rcu_callback_map); - if (i == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bkvhead[i]->nr_records, - bkvhead[i]->records); - - kfree_bulk(bkvhead[i]->nr_records, - bkvhead[i]->records); - } else { // vmalloc() / vfree(). - for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, - bkvhead[i]->records[j], 0); - - vfree(bkvhead[i]->records[j]); - } - } - rcu_lock_release(&rcu_callback_map); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bkvhead[i])) - bkvhead[i] = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bkvhead[i]) - free_page((unsigned long) bkvhead[i]); - - cond_resched_tasks_rcu_qs(); - } + // Start from the tail page, so a GP is likely passed for it. + list_for_each_entry_safe(bnode, n, &bulk_head[i], list) + kvfree_rcu_bulk(krcp, bnode, i); } /* @@ -2952,21 +2991,7 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - for (; head; head = next) { - unsigned long offset = (unsigned long)head->func; - void *ptr = (void *)head - offset; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } + kvfree_rcu_list(head); } static bool @@ -2975,10 +3000,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) int i; for (i = 0; i < FREE_N_CHANNELS; i++) - if (krcp->bkvhead[i]) + if (!list_empty(&krcp->bulk_head[i])) return true; - return !!krcp->head; + return !!READ_ONCE(krcp->head); +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ + int sum = atomic_read(&krcp->head_count); + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + sum += atomic_read(&krcp->bulk_count[i]); + + return sum; } static void @@ -2986,7 +3022,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { long delay, delay_left; - delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) @@ -2996,6 +3032,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) queue_delayed_work(system_wq, &krcp->monitor_work, delay); } +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ + struct list_head bulk_ready[FREE_N_CHANNELS]; + struct kvfree_rcu_bulk_data *bnode, *n; + struct rcu_head *head_ready = NULL; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&krcp->lock, flags); + for (i = 0; i < FREE_N_CHANNELS; i++) { + INIT_LIST_HEAD(&bulk_ready[i]); + + list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { + if (!poll_state_synchronize_rcu(bnode->gp_snap)) + break; + + atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); + list_move(&bnode->list, &bulk_ready[i]); + } + } + + if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { + head_ready = krcp->head; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + for (i = 0; i < FREE_N_CHANNELS; i++) { + list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + if (head_ready) + kvfree_rcu_list(head_ready); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3006,26 +3080,31 @@ static void kfree_rcu_monitor(struct work_struct *work) unsigned long flags; int i, j; + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + raw_spin_lock_irqsave(&krcp->lock, flags); // Attempt to start a new batch. for (i = 0; i < KFREE_N_BATCHES; i++) { struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - // Try to detach bkvhead or head and attach it over any + // Try to detach bulk_head or head and attach it over any // available corresponding free channel. It can be that // a previous RCU batch is in progress, it means that // immediately to queue another one is not possible so // in that case the monitor work is rearmed. - if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || - (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || - (krcp->head && !krwp->head_free)) { + if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || + (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || + (READ_ONCE(krcp->head) && !krwp->head_free)) { + // Channel 1 corresponds to the SLAB-pointer bulk path. // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { - if (!krwp->bkvhead_free[j]) { - krwp->bkvhead_free[j] = krcp->bkvhead[j]; - krcp->bkvhead[j] = NULL; + if (list_empty(&krwp->bulk_head_free[j])) { + atomic_set(&krcp->bulk_count[j], 0); + list_replace_init(&krcp->bulk_head[j], + &krwp->bulk_head_free[j]); } } @@ -3033,11 +3112,10 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; - krcp->head = NULL; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); } - WRITE_ONCE(krcp->count, 0); - // One work is per one batch, so there are three // "free channels", the batch can handle. It can // be that the work is in the pending state when @@ -3047,6 +3125,8 @@ static void kfree_rcu_monitor(struct work_struct *work) } } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + // If there is nothing to detach, it means that our job is // successfully done here. In case of having at least one // of the channels that is still busy we should rearm the @@ -3054,8 +3134,6 @@ static void kfree_rcu_monitor(struct work_struct *work) // still in progress. if (need_offload_krc(krcp)) schedule_delayed_monitor_work(krcp); - - raw_spin_unlock_irqrestore(&krcp->lock, flags); } static enum hrtimer_restart @@ -3138,10 +3216,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return false; idx = !!is_vmalloc_addr(ptr); + bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], + struct kvfree_rcu_bulk_data, list); /* Check if a new block is required. */ - if (!(*krcp)->bkvhead[idx] || - (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(*krcp); if (!bnode && can_alloc) { krc_this_cpu_unlock(*krcp, *flags); @@ -3165,17 +3244,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, if (!bnode) return false; - /* Initialize the new block. */ + // Initialize the new block and attach it. bnode->nr_records = 0; - bnode->next = (*krcp)->bkvhead[idx]; - - /* Attach it to the head. */ - (*krcp)->bkvhead[idx] = bnode; + list_add(&bnode->list, &(*krcp)->bulk_head[idx]); } - /* Finally insert. */ - (*krcp)->bkvhead[idx]->records - [(*krcp)->bkvhead[idx]->nr_records++] = ptr; + // Finally insert and update the GP for this page. + bnode->records[bnode->nr_records++] = ptr; + bnode->gp_snap = get_state_synchronize_rcu(); + atomic_inc(&(*krcp)->bulk_count[idx]); return true; } @@ -3192,26 +3269,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr) { unsigned long flags; struct kfree_rcu_cpu *krcp; bool success; - void *ptr; - if (head) { - ptr = (void *) head - (unsigned long) func; - } else { - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + if (!head) might_sleep(); - ptr = (unsigned long *) func; - } // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { @@ -3232,14 +3304,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Inline if kvfree_rcu(one_arg) call. goto unlock_return; - head->func = func; + head->func = ptr; head->next = krcp->head; - krcp->head = head; + WRITE_ONCE(krcp->head, head); + atomic_inc(&krcp->head_count); + + // Take a snapshot for this krcp. + krcp->head_gp_snap = get_state_synchronize_rcu(); success = true; } - WRITE_ONCE(krcp->count, krcp->count + 1); - // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); @@ -3270,7 +3344,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - count += READ_ONCE(krcp->count); + count += krc_count(krcp); count += READ_ONCE(krcp->nr_bkv_objs); atomic_set(&krcp->backoff_page_cache_fill, 1); } @@ -3287,7 +3361,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - count = krcp->count; + count = krc_count(krcp); count += drain_page_cache(krcp); kfree_rcu_monitor(&krcp->monitor_work.work); @@ -3311,15 +3385,12 @@ static struct shrinker kfree_rcu_shrinker = { void __init kfree_rcu_scheduler_running(void) { int cpu; - unsigned long flags; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_irqsave(&krcp->lock, flags); if (need_offload_krc(krcp)) schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -4776,7 +4847,7 @@ struct workqueue_struct *rcu_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; - int i; + int i, j; /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || @@ -4796,8 +4867,14 @@ static void __init kfree_rcu_batch_init(void) for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; + + for (j = 0; j < FREE_N_CHANNELS; j++) + INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); } + for (i = 0; i < FREE_N_CHANNELS; i++) + INIT_LIST_HEAD(&krcp->bulk_head[i]); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; |