From 3f237a79ddeea34dda67e9eedece3a22918df75e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:15:30 +0930 Subject: cpumask: use new operators in kernel/trace Signed-off-by: Rusty Russell LKML-Reference: <200906122115.30787.rusty@rustcorp.com.au> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e642b2b7253..9c31c9f6b93f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3105,7 +3105,7 @@ static int rb_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (cpu_isset(cpu, *buffer->cpumask)) + if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; buffer->buffers[cpu] = @@ -3116,7 +3116,7 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } smp_wmb(); - cpu_set(cpu, *buffer->cpumask); + cpumask_set_cpu(cpu, buffer->cpumask); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: -- cgit From c7b0930857e2278f2e7714db6294e94c57f623b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 11:12:00 -0400 Subject: ring-buffer: prevent adding write in discarded area This a very tight race where an interrupt could come in and not have enough data to put into the end of a buffer page, and that it would fail to write and need to go to the next page. But if this happened when another writer was about to reserver their data, and that writer has smaller data to reserve, then it could succeed even though the interrupt moved the tail page. To pervent that, if we fail to store data, and by subtracting the amount we reserved we still have room for smaller data, we need to fill that space with "discarded" data. [ Impact: prevent race were buffer data may be lost ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 12 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c31c9f6b93f..dbc0f93396aa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,6 +205,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1170,6 +1171,59 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + unsigned long tail, unsigned long length) +{ + struct ring_buffer_event *event; + + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + local_sub(length, &tail_page->write); + return; + } + + event = __rb_page_index(tail_page, tail); + + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again. + * + * If we are less than the minimum size, we don't need to + * worry about it. + */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ + + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); + + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } + + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + /* Account for this as an entry */ + local_inc(&tail_page->entries); + local_inc(&cpu_buffer->entries); + + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, @@ -1264,17 +1318,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page->page->time_stamp = *ts; } - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } - - /* Set the write back to the previous setting */ - local_sub(length, &tail_page->write); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* * If this was a commit entry that failed, @@ -1293,7 +1337,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - local_sub(length, &tail_page->write); + rb_reset_tail(cpu_buffer, tail_page, tail, length); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); -- cgit From 263294f3e1e883b9dcbf0c09a54b644918f7729d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 11:50:18 -0400 Subject: ring-buffer: remove unused variable Fix the compiler error: kernel/trace/ring_buffer.c: In function 'rb_move_tail': kernel/trace/ring_buffer.c:1236: warning: unused variable 'event' Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dbc0f93396aa..e857e9443987 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1233,7 +1233,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, { struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_event *event; bool lock_taken = false; unsigned long flags; -- cgit From fa7439531dee58277748c819785a44d3203c4b51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 12:37:57 -0400 Subject: ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 154 ++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 79 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e857e9443987..ed3559944fcf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -415,6 +415,8 @@ struct ring_buffer_per_cpu { unsigned long overrun; unsigned long read; local_t entries; + local_t committing; + local_t commits; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -1015,8 +1017,8 @@ rb_event_index(struct ring_buffer_event *event) } static inline int -rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; unsigned long index; @@ -1028,31 +1030,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void -rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - while (cpu_buffer->commit_page->page != (void *)addr) { - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page)) - return; - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - } - - /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->page->commit, index); -} - static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1319,15 +1296,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail_page, tail, length); - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } - __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); @@ -1377,11 +1345,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_inc(&tail_page->entries); /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. + * If this is the first commit on the page, then update + * its timestamp. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (!tail) + tail_page->page->time_stamp = *ts; return event; } @@ -1450,16 +1418,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return -EAGAIN; /* Only a commited time event can update the write stamp */ - if (rb_is_commit(cpu_buffer, event)) { + if (rb_event_is_commit(cpu_buffer, event)) { /* - * If this is the first on the page, then we need to - * update the page itself, and just put in a zero. + * If this is the first on the page, then it was + * updated with the page itself. Try to discard it + * and if we can't just make it zero. */ if (rb_event_index(event)) { event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->page->time_stamp = *ts; /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { /* nope, just zero it */ @@ -1485,6 +1453,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return ret; } +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) @@ -1494,6 +1500,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + rb_start_commit(cpu_buffer); + length = rb_calculate_event_length(length); again: /* @@ -1506,7 +1514,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - return NULL; + goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); @@ -1537,7 +1545,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); if (commit == -EBUSY) - return NULL; + goto out_fail; if (commit == -EAGAIN) goto again; @@ -1551,28 +1559,19 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; - if (!event) { - if (unlikely(commit)) - /* - * Ouch! We needed a timestamp and it was commited. But - * we didn't get our event reserved. - */ - rb_set_commit_to_write(cpu_buffer); - return NULL; - } + if (!event) + goto out_fail; - /* - * If the timestamp was commited, make the commit our entry - * now so that we will update it when needed. - */ - if (unlikely(commit)) - rb_set_commit_event(cpu_buffer, event); - else if (!rb_is_commit(cpu_buffer, event)) + if (!rb_event_is_commit(cpu_buffer, event)) delta = 0; event->time_delta = delta; return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; } #define TRACE_RECURSIVE_DEPTH 16 @@ -1682,13 +1681,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, { local_inc(&cpu_buffer->entries); - /* Only process further if we own the commit */ - if (!rb_is_commit(cpu_buffer, event)) - return; - - cpu_buffer->write_stamp += event->time_delta; + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) + cpu_buffer->write_stamp += event->time_delta; - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); } /** @@ -1777,15 +1777,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, /* The event is discarded regardless */ rb_event_discard(event); + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, preemptible()); - - cpu = smp_processor_id(); - cpu_buffer = buffer->buffers[cpu]; + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); if (!rb_try_to_discard(cpu_buffer, event)) goto out; @@ -1796,13 +1796,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ local_inc(&cpu_buffer->entries); out: - /* - * If a write came in and pushed the tail page - * we still need to update the commit pointer - * if we were the commit. - */ - if (rb_is_commit(cpu_buffer, event)) - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); trace_recursive_unlock(); @@ -2720,6 +2714,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->read = 0; local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; -- cgit From 22f470f8daea64bc03be1fe30c8c5df382295386 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 09:29:58 -0400 Subject: ring-buffer: use BUF_PAGE_HDR_SIZE in calculating index The index of the event is found by masking PAGE_MASK to it and subtracting the header size. Currently the header size is calculate by PAGE_SIZE - BUF_PAGE_SIZE, when we already have a macro BUF_PAGE_HDR_SIZE to define it. If we want to change BUF_PAGE_SIZE to something less than filling the rest of the page (this is done for debugging), then we break the algorithm to find the index. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ed3559944fcf..6b17a11e42a2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1013,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } static inline int -- cgit From c6a9d7b55e2df63de012a9a285bf2a0bee8e4d59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 09:49:15 -0400 Subject: ring-buffer: remove useless warn on check A check if "write > BUF_PAGE_SIZE" is done right after a if (write > BUF_PAGE_SIZE) return ...; Thus the check is actually testing the compiler and not the kernel. This is useless, remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6b17a11e42a2..6cf340e1a4a3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1334,9 +1334,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); -- cgit From 0dcd4d6c3e5b17ccf88d41cb354bb4d57cb18cbf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 14:03:44 -0400 Subject: ring-buffer: remove useless compile check for buffer_page size The original version of the ring buffer had a hack to map the page struct that held the pages of the buffer to also be the structure that the ring buffer would keep the pages in a link list. This overlap of the page struct was very dangerous and that hack was removed a while ago. But there was a check to make sure the buffer_page never became bigger than the page struct, and would fail the compile if it did. The check was only meaningful when we had the hack. Now that we have separate allocated descriptors for the buffer pages, we can remove this check. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6cf340e1a4a3..162da2305cbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,12 +620,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/* - * Causes compile errors if the struct buffer_page gets bigger - * than the struct page. - */ -extern int ring_buffer_page_too_big(void); - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); @@ -648,11 +642,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, int bsize; int cpu; - /* Paranoid! Optimizes out when all is well */ - if (sizeof(struct buffer_page) > sizeof(struct page)) - ring_buffer_page_too_big(); - - /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); -- cgit From 5f78abeebbf0a80975d719e11374535ca15396cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 14:11:10 -0400 Subject: ring-buffer: check for less than two in size allocation The ring buffer must have at least two pages allocated for the reader page swap to work. The page count check will miss the case of a zero size passed in. Even though a zero size ring buffer would probably fail an allocation, making the min size check for less than two instead of equal to one makes the code a bit more robust. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 162da2305cbc..2e99dba6dc48 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -657,8 +657,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages == 1) - buffer->pages++; + if (buffer->pages < 2) + buffer->pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated -- cgit From d47882078f05c2cb46b85f1e12a58ed9315b9d63 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 00:39:43 -0400 Subject: ring-buffer: add locks around rb_per_cpu_empty The checking of whether the buffer is empty or not needs to be serialized among the readers. Add the reader spin lock around it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e99dba6dc48..969f7cbe8e93 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2756,12 +2756,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; int cpu; + int ret; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - if (!rb_per_cpu_empty(cpu_buffer)) + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + ret = rb_per_cpu_empty(cpu_buffer); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (!ret) return 0; } @@ -2777,14 +2782,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); ret = rb_per_cpu_empty(cpu_buffer); - + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } -- cgit From 8d707e8eb4de4b930573155ab4df4b3270ee25dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 21:22:48 -0400 Subject: ring-buffer: do not grab locks in nmi If ftrace_dump_on_oops is set, and an NMI detects a lockup, then it will need to read from the ring buffer. But the read side of the ring buffer still takes locks. This patch adds a check on the read side that if it is in an NMI, then it will disable the ring buffer and not take any locks. Reads can still happen on a disabled ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 59 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 969f7cbe8e93..589b3eedfa67 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2466,6 +2466,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. + */ + if (likely(!in_nmi() && !oops_in_progress)) + return 1; + + tracing_off_permanent(); + return 0; +} + /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read @@ -2481,14 +2496,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; + int dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + dolock = rb_ok_to_lock(); again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); @@ -2540,6 +2561,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; + int dolock; + + dolock = rb_ok_to_lock(); again: /* might be called in atomic */ @@ -2549,7 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) goto out; cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); if (!event) @@ -2558,7 +2584,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) rb_advance_reader(cpu_buffer); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); out: preempt_enable(); @@ -2757,15 +2785,23 @@ int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int dolock; int cpu; int ret; + dolock = rb_ok_to_lock(); + /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + if (!ret) return 0; } @@ -2783,15 +2819,22 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; + dolock = rb_ok_to_lock(); + cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); return ret; } -- cgit From 1155de47cd66d0c496d5a6fb2223e980ef1285b2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 25 Jun 2009 14:30:12 +0900 Subject: ring-buffer: Make it generally available In hunting down the cause for the hwlat_detector ring buffer spew in my failed -next builds it became obvious that folks are now treating ring_buffer as something that is generic independent of tracing and thus, suitable for public driver consumption. Given that there are only a few minor areas in ring_buffer that have any reliance on CONFIG_TRACING or CONFIG_FUNCTION_TRACER, provide stubs for those and make it generally available. Signed-off-by: Paul Mundt Cc: Jon Masters Cc: Steven Rostedt LKML-Reference: <20090625053012.GB19944@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04dac2638258..bf27bb7a63e2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1563,6 +1563,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) @@ -1593,6 +1595,13 @@ static void trace_recursive_unlock(void) current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -3104,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3171,6 +3181,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, -- cgit From 3adc54fa82a68be1cd1ac82ad786ee362796e50a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 15:32:01 -0400 Subject: ring-buffer: make the buffer a true circular link list This patch changes the ring buffer data pages from using a link list head pointer, to making each buffer page point to another buffer page and never back to a "head". This makes the handling of the ring buffer less complex, since the traversing of the ring buffer pages no longer needs to account for the head pointer. This change also is needed to make the ring buffer lockless. [ Changes in version 2: - Added change that Lai Jiangshan mentioned. From: Lai Jiangshan Date: Thu, 11 Jun 2009 11:25:48 +0800 LKML-Reference: <4A30793C.6090208@cn.fujitsu.com> I'm not sure whether these 4 lines: bpage = list_entry(pages.next, struct buffer_page, list); list_del_init(&bpage->list); cpu_buffer->pages = &bpage->list; list_splice(&pages, cpu_buffer->pages); equal to these 2 lines: cpu_buffer->pages = pages.next; list_del(&pages); If there are equivalent, I think the second one are simpler. It may be not a really necessarily cleanup. What I asked is: if there are equivalent, could you use these two line: cpu_buffer->pages = pages.next; list_del(&pages); ] [ Impact: simplify the ring buffer to help make it lockless ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..7c0168ad6d51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -406,7 +406,7 @@ struct ring_buffer_per_cpu { spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -521,12 +521,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); @@ -541,7 +542,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. + */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -573,7 +580,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -594,7 +600,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; return cpu_buffer; @@ -609,15 +615,20 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -760,14 +771,14 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; - p = cpu_buffer->pages.next; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; rb_reset_cpu(cpu_buffer); @@ -795,7 +806,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -992,9 +1003,6 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, { struct list_head *p = (*bpage)->list.next; - if (p == &cpu_buffer->pages) - p = p->next; - *bpage = list_entry(p, struct buffer_page, list); } @@ -2247,6 +2255,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidently swap it. + */ + cpu_buffer->pages = reader->list.prev; + local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -2719,7 +2734,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); -- cgit From 77ae365eca895061c8bf2b2e3ae1d9ea62869739 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Mar 2009 11:00:29 -0400 Subject: ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 886 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 738 insertions(+), 148 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c0168ad6d51..e648ba4f70e0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -322,6 +322,14 @@ struct buffer_data_page { unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ @@ -330,6 +338,21 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -403,7 +426,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -411,13 +434,12 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the give page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. */ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = (unsigned long)cmpxchg(&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, &new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + */ + local_cmpxchg(&next_page->write, old_write, val); + local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -501,11 +902,16 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,8 +919,12 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } @@ -533,6 +943,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -586,6 +999,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -603,6 +1018,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -620,6 +1037,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(cpu_buffer->reader_page); + rb_head_page_deactivate(cpu_buffer); + if (head) { list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); @@ -770,6 +1189,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; @@ -800,6 +1221,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; @@ -809,6 +1233,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -958,22 +1383,15 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -981,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -993,19 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1031,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1040,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1051,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1085,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1129,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event, } } +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1207,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also the on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + next_page->page->time_stamp = *ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1305,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } @@ -1324,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ @@ -1368,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1882,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1975,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; return ret; @@ -1996,32 +2581,12 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); -/** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer @@ -2037,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2060,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + local_read(&cpu_buffer->overrun)) - cpu_buffer->read; } return entries; @@ -2083,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2096,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2214,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2247,11 +2815,17 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); - reader = cpu_buffer->head_page; + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -2262,22 +2836,35 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ cpu_buffer->pages = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * If we did not convert it, then we must try again. */ - cpu_buffer->head_page = cpu_buffer->reader_page; + if (!ret) + goto spin; - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; @@ -2733,6 +3320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); @@ -2750,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -3107,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); -- cgit From da706d8bc833e7153622435560422e653bdb2e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Jul 2009 16:27:30 +0800 Subject: ring_buffer: Fix warning while ignoring cmpxchg return value kernel/trace/ring_buffer.c: In function 'rb_tail_page_update': kernel/trace/ring_buffer.c:849: warning: value computed is not used kernel/trace/ring_buffer.c:850: warning: value computed is not used Add "(void)"s to fix this warning, because we don't need here to handle the fail case of cmpxchg, it's fine if an interrupt already did the job. Changed from V1: Add a comment(which is written by Steven) for it. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e648ba4f70e0..51633d74a21e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -845,9 +845,14 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. */ - local_cmpxchg(&next_page->write, old_write, val); - local_cmpxchg(&next_page->entries, old_entries, eval); + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. -- cgit From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inversed the code path of the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry conder if it succeeded to discard, and does not increment it if it fails. The result of this bug is that filtering will make the stat counters incorrect. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..2fd1752f0c85 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752f0c85..2606cee433da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. */ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee433da..d4d3580a894a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580a894a..a330513d96ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit From 41b6a95d693319f804607b559893fbbd27498548 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 09:59:48 -0400 Subject: ring-buffer: do not reset while in a commit The callers of reset must ensure that no commit can be taking place at the time of the reset. If it does then we may corrupt the ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index da2c59d8f486..79d6012bb1f1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3373,12 +3373,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); __raw_spin_unlock(&cpu_buffer->lock); + out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); -- cgit From 98277991a99734f3a31d638afb47d4484ac73e43 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 10:56:15 -0400 Subject: ring-buffer: do not swap buffers during a commit If a commit is taking place on a CPU ring buffer, do not allow it to be swapped. Return -EBUSY when this is detected instead. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 79d6012bb1f1..2878bd43a59c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3519,16 +3519,23 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; + ret = 0; + +out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - - ret = 0; out: return ret; } -- cgit From 1b959e18c4d6b4b981f887260b0f8e7939efa411 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:12:13 -0400 Subject: ring-buffer: remove unnecessary cpu_relax The loops in the ring buffer that use cpu_relax are not dependent on other CPUs. They simply came across some padding in the ring buffer and are skipping over them. It is a normal loop and does not require a cpu_relax. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2878bd43a59c..a05541a8fbae 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3132,10 +3132,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3160,10 +3158,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3209,10 +3205,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3302,10 +3296,8 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } -- cgit From 7e9391cfedce34eb9786bfa69d7d545dc93ef930 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:02:09 -0400 Subject: ring-buffer: fix ring_buffer_read crossing pages When the ring buffer uses an iterator (static read mode, not on the fly reading), when it crosses a page boundery, it will skip the first entry on the next page. The reason is that the last entry of a page is usually padding if the page is not full. The padding will not be returned to the user. The problem arises on ring_buffer_read because it also increments the iterator. Because both the read and peek use the same rb_iter_peek, the rb_iter_peak will return the padding but also increment to the next item. This is because the ring_buffer_peek will not incerment it itself. The ring_buffer_read will increment it again and then call rb_iter_peek again to get the next item. But that will be the second item, not the first one on the page. The reason this never showed up before, is because the ftrace utility always calls ring_buffer_peek first and only uses ring_buffer_read to increment to the next item. The ring_buffer_peek will always keep the pointer to a valid item and not padding. This just hid the bug. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a05541a8fbae..9d939e7ca924 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3286,19 +3286,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + again: event = rb_iter_peek(iter, ts); if (!event) goto out; + if (event->type_len == RINGBUF_TYPE_PADDING) + goto again; + rb_advance_iter(iter); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) - goto again; - return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit From dc892f7339af2d125478b800edb9081d6149665b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 15:33:41 -0400 Subject: ring-buffer: remove ring_buffer_event_discard The function ring_buffer_event_discard can be used on any item in the ring buffer, even after the item was committed. This function provides no safety nets and is very race prone. An item may be safely removed from the ring buffer before it is committed with the ring_buffer_discard_commit. Since there are currently no users of this function, and because this function is racey and error prone, this patch removes it altogether. Note, removing this function also allows the counters to ignore all discarded events (patches will follow). Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9d939e7ca924..092fe0c8fdae 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2327,32 +2327,17 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -/** - * ring_buffer_event_discard - discard any event in the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - rb_event_discard(event); -} -EXPORT_SYMBOL_GPL(ring_buffer_event_discard); - /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * - * This is similar to ring_buffer_event_discard but must only be - * performed on an event that has not been committed yet. The difference - * is that this will also try to free the event from the ring buffer + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the the item has been + * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event -- cgit From a1863c212b7517afc2b13e549552ac322fb44cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:23:58 -0400 Subject: ring-buffer: do not count discarded events The latency tracers report the number of items in the trace buffer. This uses the ring buffer data to calculate this. Because discarded events are also counted, the numbers do not match the number of items that are printed. The ring buffer also adds a "padding" item to the end of each buffer page which also gets counted as a discarded item. This patch decrements the counter to the page entries on a discard. This allows us to ignore discarded entries while reading the buffer. Decrementing the counter is still safe since it can only happen while the committing flag is still set. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 17 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 092fe0c8fdae..c8d2a66e1d1f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -218,17 +218,12 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type_len == RINGBUF_TYPE_PADDING - && event->time_delta == 0; -} - -static inline int rb_discarded_event(struct ring_buffer_event *event) -{ - return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { + /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -1778,9 +1773,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; - /* Account for this as an entry */ - local_inc(&tail_page->entries); - local_inc(&cpu_buffer->entries); /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; @@ -2269,18 +2261,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - local_inc(&cpu_buffer->entries); - /* * The event first in the commit queue updates the * time stamp. */ if (rb_event_is_commit(cpu_buffer, event)) cpu_buffer->write_stamp += event->time_delta; +} +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -2327,6 +2324,46 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. + */ +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. + */ + rb_inc_page(cpu_buffer, &bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(cpu_buffer, &bpage); + } while (bpage != start); + + /* commit not part of this buffer?? */ + RB_WARN_ON(cpu_buffer, 1); +} + /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer @@ -2365,14 +2402,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; /* * The commit is still visible by the reader, so we - * must increment entries. + * must still update the timestamp. */ - local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -2884,8 +2922,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX - || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); -- cgit From 077c5407cd3231cf13472623995f0dfdda510d62 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 19:53:46 -0400 Subject: ring-buffer: disable all cpu buffers when one finds a problem Currently the way RB_WARN_ON works, is to disable either the current CPU buffer or all CPU buffers, depending on whether a ring_buffer or ring_buffer_per_cpu struct was passed into the macro. Most users of the RB_WARN_ON pass in the CPU buffer, so only the one CPU buffer gets disabled but the rest are still active. This may confuse users even though a warning is sent to the console. This patch changes the macro to disable the entire buffer even if the CPU buffer is passed in. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8d2a66e1d1f..f83a42a79ee8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -467,14 +467,19 @@ struct ring_buffer_iter { }; /* buffer may be either ring_buffer or ring_buffer_per_cpu */ -#define RB_WARN_ON(buffer, cond) \ - ({ \ - int _____ret = unlikely(cond); \ - if (_____ret) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - _____ret; \ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit From 62f0b3eb5cb58931a02ee4e599e19c80a171e351 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 14:11:34 -0400 Subject: ring-buffer: check for swapped buffers in start of committing Because the irqsoff tracer can swap an internal CPU buffer, it is possible that a swap happens between the start of the write and before the committing bit is set (the committing bit will disable swapping). This patch adds a check for this and will fail the write if it detects it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f83a42a79ee8..1766c0e8db5a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2073,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; @@ -2083,6 +2084,19 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, rb_start_commit(cpu_buffer); + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } + length = rb_calculate_event_length(length); again: /* @@ -2243,7 +2257,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; @@ -2476,7 +2490,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; -- cgit From 85bac32c4a52c592b857f2c360cc5ec93a097d70 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 14:24:40 -0400 Subject: ring-buffer: only enable ring_buffer_swap_cpu when needed Since the ability to swap the cpu buffers adds a small overhead to the recording of a trace, we only want to add it when needed. Only the irqsoff and preemptoff tracers use this feature, and both are not recommended for production kernels. This patch disables its use when neither irqsoff nor preemptoff is configured. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace/ring_buffer.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1766c0e8db5a..454e74e718cf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2084,6 +2084,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, rb_start_commit(cpu_buffer); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. @@ -2096,6 +2097,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, local_dec(&cpu_buffer->commits); return NULL; } +#endif length = rb_calculate_event_length(length); again: @@ -3498,6 +3500,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with @@ -3573,6 +3576,7 @@ out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer -- cgit