Merge tag 'trace-ringbuffer-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull ring-buffer updates from Steven Rostedt:

 - Rewind persistent ring buffer on boot

   When the persistent ring buffer is being used for live kernel tracing
   and the system crashes, the tool reading the trace may not have
   recorded the data it had already consumed before the crash.

   Although the persistent ring buffer still has that data, reading it
   after a reboot resumes where the previous reader left off. That is,
   what was already read is no longer accessible.

   Instead, on reboot, rewind the persistent ring buffer to where the
   data starts, which allows the tooling to recover what was lost when
   the crash occurred (a simplified sketch of the rewind walk follows
   this list).

 - Remove the ring_buffer_read_prepare_sync() logic

   Reading the trace file used to require stopping writes to the ring
   buffer, because the trace file is only an iterator and does not
   consume what it reads, and it was originally not safe to iterate
   while writing was still enabled. The ring_buffer_read_prepare_sync()
   logic stopped each per_cpu ring buffer, called synchronize_rcu()
   once, and then started the iterators, instead of calling
   synchronize_rcu() separately for each per_cpu buffer.

   Today, the iterator has been updated so that it is safe to read the
   trace file while writing to the ring buffer is still occurring. This
   synchronization is no longer needed, and it was causing large delays
   on machines with many CPUs. Remove it (a before/after sketch of the
   caller-side change follows this list).

 - Make static string array a constant in show_irq_str()

   Making the string array into a constant has been shown to decrease
   text/data size (a minimal illustration follows this list).
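
The rewind added to rb_meta_validate_events() (see the
kernel/trace/ring_buffer.c hunk in the diff below) walks backward from the
head page toward the writer's tail page, accepting pages only while their
timestamps keep getting older, they contain data, and they pass validation;
the oldest accepted page then becomes the new head. The following is a
simplified, self-contained sketch of that walk only: struct demo_page and
rewind_head() are illustrative stand-ins, not the kernel's buffer_page or
its accessors.

        #include <stdint.h>
        #include <stddef.h>

        /* Pages form a ring via prev/next; the writer owns the tail page. */
        struct demo_page {
                struct demo_page *prev, *next;
                uint64_t time_stamp;    /* timestamp of first event on the page */
                size_t commit;          /* bytes of data committed to the page */
                int valid;              /* outcome of the integrity check */
        };

        /*
         * Walk backward from head, accepting only progressively older,
         * non-empty, valid pages, and stop at the writer's tail page.
         * Return the oldest accepted page (the original head if none).
         */
        struct demo_page *rewind_head(struct demo_page *head,
                                      struct demo_page *tail, int nr_pages)
        {
                uint64_t ts = head->time_stamp;
                struct demo_page *p = head->prev;
                int i;

                for (i = 0; i < nr_pages + 1; i++, p = p->prev) {
                        if (p == tail)                  /* reached the writer */
                                break;
                        if (ts < p->time_stamp)         /* data is not older */
                                break;
                        ts = p->time_stamp;
                        if (!ts || p->commit == 0)      /* empty page */
                                break;
                        if (!p->valid)                  /* failed validation */
                                break;
                }
                /* The page the loop stopped on was rejected; step forward
                 * to the last accepted page. */
                return p->next;
        }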
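
For callers, removing ring_buffer_read_prepare_sync() collapses the old
three-step sequence (prepare each per-CPU iterator, synchronize once, then
start each iterator) into a single ring_buffer_read_start() call. A
condensed before/after view of the caller-side pattern, mirroring the
__tracing_open() hunk in the diff below (error handling omitted):

        /* Before: prepare all iterators, synchronize once, then start. */
        for_each_tracing_cpu(cpu) {
                iter->buffer_iter[cpu] =
                        ring_buffer_read_prepare(iter->array_buffer->buffer,
                                                 cpu, GFP_KERNEL);
        }
        ring_buffer_read_prepare_sync();        /* one synchronize_rcu() */
        for_each_tracing_cpu(cpu) {
                ring_buffer_read_start(iter->buffer_iter[cpu]);
                tracing_iter_reset(iter, cpu);
        }

        /* After: ring_buffer_read_start() allocates and starts the
         * iterator in one call; no synchronize_rcu() pass remains. */
        for_each_tracing_cpu(cpu) {
                iter->buffer_iter[cpu] =
                        ring_buffer_read_start(iter->array_buffer->buffer,
                                               cpu, GFP_KERNEL);
                tracing_iter_reset(iter, cpu);
        }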
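
The show_irq_str() change is the usual trick of making a function-local
lookup table static: a plain local array of string pointers may be rebuilt
on every call, while a static one is emitted once and only referenced. A
minimal standalone illustration (not the kernel function itself;
flag_name_local()/flag_name_static() are made-up names):

        #include <stdio.h>

        /* Local array: set up again on each call. */
        static const char *flag_name_local(int bit)
        {
                const char *type[] = { ".", "s", "h", "H" };

                return type[bit & 3];
        }

        /* Static array: emitted once in static storage, with no per-call
         * construction code, which is where the text/data savings come
         * from. */
        static const char *flag_name_static(int bit)
        {
                static const char *type[] = { ".", "s", "h", "H" };

                return type[bit & 3];
        }

        int main(void)
        {
                printf("%s %s\n", flag_name_local(1), flag_name_static(2));
                return 0;
        }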

* tag 'trace-ringbuffer-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Make the const read-only 'type' static
  ring-buffer: Remove ring_buffer_read_prepare_sync()
  tracing: ring_buffer: Rewind persistent ring buffer on reboot
Linus Torvalds 2025-07-30 16:16:58 -07:00
commit d50b07d05c
4 changed files with 119 additions and 75 deletions

include/linux/ring_buffer.h

@@ -152,9 +152,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
                     unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
kernel/trace/ring_buffer.c

@@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
         *bpage = list_entry(p, struct buffer_page, list);
 }
 
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+        struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+        *bpage = list_entry(p, struct buffer_page, list);
+}
+
 static struct buffer_page *
 rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
 static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 {
         struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
-        struct buffer_page *head_page;
+        struct buffer_page *head_page, *orig_head;
         unsigned long entry_bytes = 0;
         unsigned long entries = 0;
         int ret;
+        u64 ts;
         int i;
 
         if (!meta || !meta->head_buffer)
@@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
         entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
         local_set(&cpu_buffer->reader_page->entries, ret);
 
-        head_page = cpu_buffer->head_page;
+        orig_head = head_page = cpu_buffer->head_page;
+        ts = head_page->page->time_stamp;
+
+        /*
+         * Try to rewind the head so that we can read the pages which already
+         * read in the previous boot.
+         */
+        if (head_page == cpu_buffer->tail_page)
+                goto skip_rewind;
+
+        rb_dec_page(&head_page);
+        for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+                /* Rewind until tail (writer) page. */
+                if (head_page == cpu_buffer->tail_page)
+                        break;
+
+                /* Ensure the page has older data than head. */
+                if (ts < head_page->page->time_stamp)
+                        break;
+
+                ts = head_page->page->time_stamp;
+                /* Ensure the page has correct timestamp and some data. */
+                if (!ts || rb_page_commit(head_page) == 0)
+                        break;
+
+                /* Stop rewind if the page is invalid. */
+                ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+                if (ret < 0)
+                        break;
+
+                /* Recover the number of entries and update stats. */
+                local_set(&head_page->entries, ret);
+                if (ret)
+                        local_inc(&cpu_buffer->pages_touched);
+                entries += ret;
+                entry_bytes += rb_page_commit(head_page);
+        }
+        if (i)
+                pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
+
+        /* The last rewound page must be skipped. */
+        if (head_page != orig_head)
+                rb_inc_page(&head_page);
+
+        /*
+         * If the ring buffer was rewound, then inject the reader page
+         * into the location just before the original head page.
+         */
+        if (head_page != orig_head) {
+                struct buffer_page *bpage = orig_head;
+
+                rb_dec_page(&bpage);
+                /*
+                 * Insert the reader_page before the original head page.
+                 * Since the list encode RB_PAGE flags, general list
+                 * operations should be avoided.
+                 */
+                cpu_buffer->reader_page->list.next = &orig_head->list;
+                cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+                orig_head->list.prev = &cpu_buffer->reader_page->list;
+                bpage->list.next = &cpu_buffer->reader_page->list;
+
+                /* Make the head_page the reader page */
+                cpu_buffer->reader_page = head_page;
+                bpage = head_page;
+                rb_inc_page(&head_page);
+                head_page->list.prev = bpage->list.prev;
+                rb_dec_page(&bpage);
+                bpage->list.next = &head_page->list;
+                rb_set_list_to_head(&bpage->list);
+                cpu_buffer->pages = &head_page->list;
+
+                cpu_buffer->head_page = head_page;
+                meta->head_buffer = (unsigned long)head_page->page;
+
+                /* Reset all the indexes */
+                bpage = cpu_buffer->reader_page;
+                meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+                bpage->id = 0;
+
+                for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+                     i++, rb_inc_page(&bpage)) {
+                        meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+                        bpage->id = i;
+                }
+
+                /* We'll restart verifying from orig_head */
+                head_page = orig_head;
+        }
+
+ skip_rewind:
         /* If the commit_buffer is the reader page, update the commit page */
         if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
                 cpu_buffer->commit_page = cpu_buffer->reader_page;
@@ -4118,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 static const char *show_irq_str(int bits)
 {
-        const char *type[] = {
+        static const char * type[] = {
                 ".",    // 0
                 "s",    // 1
                 "h",    // 2
@@ -5342,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
          */
         local_set(&cpu_buffer->reader_page->write, 0);
         local_set(&cpu_buffer->reader_page->entries, 0);
-        local_set(&cpu_buffer->reader_page->page->commit, 0);
         cpu_buffer->reader_page->real_end = 0;
 
  spin:
@@ -5846,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
- *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
         struct ring_buffer_per_cpu *cpu_buffer;
         struct ring_buffer_iter *iter;
@@ -5889,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
         atomic_inc(&cpu_buffer->resize_disabled);
 
-        return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-        synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-        struct ring_buffer_per_cpu *cpu_buffer;
-        unsigned long flags;
-
-        if (!iter)
-                return;
-
-        cpu_buffer = iter->cpu_buffer;
-
-        raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+        guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
         arch_spin_lock(&cpu_buffer->lock);
         rb_iter_reset(iter);
         arch_spin_unlock(&cpu_buffer->lock);
-        raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+        return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);

kernel/trace/trace.c

@@ -4735,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
         if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
                 for_each_tracing_cpu(cpu) {
                         iter->buffer_iter[cpu] =
-                                ring_buffer_read_prepare(iter->array_buffer->buffer,
-                                                         cpu, GFP_KERNEL);
-                }
-                ring_buffer_read_prepare_sync();
-                for_each_tracing_cpu(cpu) {
-                        ring_buffer_read_start(iter->buffer_iter[cpu]);
+                                ring_buffer_read_start(iter->array_buffer->buffer,
+                                                       cpu, GFP_KERNEL);
                         tracing_iter_reset(iter, cpu);
                 }
         } else {
                 cpu = iter->cpu_file;
                 iter->buffer_iter[cpu] =
-                        ring_buffer_read_prepare(iter->array_buffer->buffer,
-                                                 cpu, GFP_KERNEL);
-                ring_buffer_read_prepare_sync();
-                ring_buffer_read_start(iter->buffer_iter[cpu]);
+                        ring_buffer_read_start(iter->array_buffer->buffer,
+                                               cpu, GFP_KERNEL);
                 tracing_iter_reset(iter, cpu);
         }

kernel/trace/trace_kdb.c

@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
         if (cpu_file == RING_BUFFER_ALL_CPUS) {
                 for_each_tracing_cpu(cpu) {
                         iter.buffer_iter[cpu] =
-                        ring_buffer_read_prepare(iter.array_buffer->buffer,
-                                                 cpu, GFP_ATOMIC);
-                        ring_buffer_read_start(iter.buffer_iter[cpu]);
+                        ring_buffer_read_start(iter.array_buffer->buffer,
+                                               cpu, GFP_ATOMIC);
                         tracing_iter_reset(&iter, cpu);
                 }
         } else {
                 iter.cpu_file = cpu_file;
                 iter.buffer_iter[cpu_file] =
-                        ring_buffer_read_prepare(iter.array_buffer->buffer,
+                        ring_buffer_read_start(iter.array_buffer->buffer,
                                                 cpu_file, GFP_ATOMIC);
-                ring_buffer_read_start(iter.buffer_iter[cpu_file]);
                 tracing_iter_reset(&iter, cpu_file);
         }