perf: Support deferred user unwind

Add support for deferred userspace unwind to perf.

Perf currently relies on in-place stack unwinding, from NMI context
and all that. This moves the userspace part of the unwind to right
before the return to userspace.

This has two distinct benefits. The biggest is that it moves the
unwind to a faultable context, where it becomes possible to fault in
debug info (.eh_frame, SFrame, etc.) that might not otherwise be
readily available. Secondly, it de-duplicates the user callchain when
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

This marker is followed by a single entry holding the 'cookie' of the
deferred callchain that should be stitched in at that point, matching
the 'cookie' field of the PERF_RECORD_CALLCHAIN_DEFERRED record
described above.
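
For illustration, a consumer would stitch the two record types together
roughly as follows (a minimal sketch; the record layout matches the ABI
described above, but the lookup/emit callbacks and any buffering of the
deferred records are hypothetical):

  /* Hypothetical stitching helper -- not part of this patch. */
  #include <stdint.h>

  #define PERF_CONTEXT_USER_DEFERRED ((uint64_t)-640)

  /*
   * Walk the ips[] of a PERF_RECORD_SAMPLE callchain; on the deferred
   * marker the next entry is the cookie naming the
   * PERF_RECORD_CALLCHAIN_DEFERRED record whose ips[] hold the user
   * part of this callchain.
   */
  static void stitch_sample(const uint64_t *ips, uint64_t nr,
                            const uint64_t *(*lookup)(uint64_t cookie, uint64_t *dnr),
                            void (*emit)(uint64_t ip))
  {
          for (uint64_t i = 0; i < nr; i++) {
                  if (ips[i] == PERF_CONTEXT_USER_DEFERRED && i + 1 < nr) {
                          uint64_t dnr;
                          const uint64_t *dips = lookup(ips[i + 1], &dnr);

                          for (uint64_t j = 0; dips && j < dnr; j++)
                                  emit(dips[j]);
                          i++;    /* skip the cookie entry */
                          continue;
                  }
                  emit(ips[i]);
          }
  }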

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (mmap, comm, etc.); setting
'defer_output' on multiple events results in duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.
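
As a rough sketch of the intended tool-side setup (the attribute flags
are the ones added below; the helper, the event choice and the
side-band bits are only illustrative):

  #include <string.h>
  #include <linux/perf_event.h>   /* with this patch applied */

  static void setup_sampling_attr(struct perf_event_attr *attr, int sideband)
  {
          memset(attr, 0, sizeof(*attr));
          attr->size              = sizeof(*attr);
          attr->type              = PERF_TYPE_HARDWARE;
          attr->config            = PERF_COUNT_HW_CPU_CYCLES;
          attr->sample_type       = PERF_SAMPLE_CALLCHAIN;
          attr->sample_period     = 100000;
          attr->defer_callchain   = 1;    /* defer the user part of the unwind */

          if (sideband) {
                  /* only one event should collect side-band and deferred records */
                  attr->mmap = attr->comm = attr->task = 1;
                  attr->defer_output = 1; /* emit PERF_RECORD_CALLCHAIN_DEFERRED */
          }
  }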

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
Peter Zijlstra 2025-10-23 15:17:05 +02:00
parent ae25884ad7
commit c69993ecdd
8 changed files with 145 additions and 20 deletions


@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                   u32 max_stack, bool crosstask, bool add_mark);
+                   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);


@@ -6,18 +6,6 @@
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
-struct unwind_work;
-
-typedef void (*unwind_callback_t)(struct unwind_work *work,
-                                  struct unwind_stacktrace *trace,
-                                  u64 cookie);
-
-struct unwind_work {
-        struct list_head        list;
-        unwind_callback_t       func;
-        int                     bit;
-};
-
 #ifdef CONFIG_UNWIND_USER
 
 enum {


@@ -39,4 +39,17 @@ struct unwind_task_info {
         union unwind_task_id    id;
 };
 
+struct unwind_work;
+struct unwind_stacktrace;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+                                  struct unwind_stacktrace *trace,
+                                  u64 cookie);
+
+struct unwind_work {
+        struct list_head        list;
+        unwind_callback_t       func;
+        int                     bit;
+};
+
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */


@@ -463,7 +463,9 @@ struct perf_event_attr {
                                 inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                 remove_on_exec :  1, /* event is removed from task on exec */
                                 sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                                __reserved_1   : 26;
+                                defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                                defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                                __reserved_1   : 24;
 
         union {
                 __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
          */
         PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+        /*
+         * This user callchain capture was deferred until shortly before
+         * returning to user space. Previous samples would have kernel
+         * callchains only and they need to be stitched with this to make full
+         * callchains.
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *      u64                             cookie;
+         *      u64                             nr;
+         *      u64                             ips[nr];
+         *      struct sample_id                sample_id;
+         * };
+         */
+        PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
         PERF_RECORD_MAX,                        /* non-ABI */
 };
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
         PERF_CONTEXT_HV                         = (__u64)-32,
         PERF_CONTEXT_KERNEL                     = (__u64)-128,
         PERF_CONTEXT_USER                       = (__u64)-512,
+        PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
         PERF_CONTEXT_GUEST                      = (__u64)-2048,
         PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,


@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
                 max_depth = sysctl_perf_event_max_stack;
 
         trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                   false, false);
+                                   false, false, 0);
 
         if (unlikely(!trace))
                 /* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                 trace = get_callchain_entry_for_task(task, max_depth);
         else
                 trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                           crosstask, false);
+                                           crosstask, false, 0);
 
         if (unlikely(!trace) || trace->nr < skip) {
                 if (may_fault)


@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                   u32 max_stack, bool crosstask, bool add_mark)
+                   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
         struct perf_callchain_entry *entry;
         struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
                         regs = task_pt_regs(current);
                 }
 
+                if (defer_cookie) {
+                        /*
+                         * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+                         * which can be stitched to this one, and add
+                         * the cookie after it (it will be cut off when the
+                         * user stack is copied to the callchain).
+                         */
+                        perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+                        perf_callchain_store_context(&ctx, defer_cookie);
+                        goto exit_put;
+                }
+
                 if (add_mark)
                         perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);


@@ -56,6 +56,7 @@
 #include <linux/buildid.h>
 #include <linux/task_work.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
 
 #include "internal.h"
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+static struct unwind_work perf_unwind_work;
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
                        !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
         /* Disallow cross-task user callchains. */
         bool crosstask = event->ctx->task && event->ctx->task != current;
+        bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+                          event->attr.defer_callchain;
         const u32 max_stack = event->attr.sample_max_stack;
         struct perf_callchain_entry *callchain;
+        u64 defer_cookie;
 
         if (!current->mm)
                 user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
         if (!kernel && !user)
                 return &__empty_callchain;
 
-        callchain = get_perf_callchain(regs, kernel, user,
-                                       max_stack, crosstask, true);
+        if (!(user && defer_user && !crosstask &&
+              unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+                defer_cookie = 0;
+
+        callchain = get_perf_callchain(regs, kernel, user, max_stack,
+                                       crosstask, true, defer_cookie);
+
         return callchain ?: &__empty_callchain;
 }
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
 	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_callchain_deferred_event {
+        struct unwind_stacktrace        *trace;
+        struct {
+                struct perf_event_header        header;
+                u64                             cookie;
+                u64                             nr;
+                u64                             ips[];
+        } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+        struct perf_callchain_deferred_event *deferred_event = data;
+        struct perf_output_handle handle;
+        struct perf_sample_data sample;
+        int ret, size = deferred_event->event.header.size;
+
+        if (!event->attr.defer_output)
+                return;
+
+        /* XXX do we really need sample_id_all for this ??? */
+        perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+        ret = perf_output_begin(&handle, &sample, event,
+                                deferred_event->event.header.size);
+        if (ret)
+                goto out;
+
+        perf_output_put(&handle, deferred_event->event);
+
+        for (int i = 0; i < deferred_event->trace->nr; i++) {
+                u64 entry = deferred_event->trace->entries[i];
+                perf_output_put(&handle, entry);
+        }
+
+        perf_event__output_id_sample(event, &handle, &sample);
+        perf_output_end(&handle);
+
+out:
+        deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+                                          struct unwind_stacktrace *trace, u64 cookie)
+{
+        struct perf_callchain_deferred_event deferred_event = {
+                .trace = trace,
+                .event = {
+                        .header = {
+                                .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+                                .misc = PERF_RECORD_MISC_USER,
+                                .size = sizeof(deferred_event.event) +
+                                        (trace->nr * sizeof(u64)),
+                        },
+                        .cookie = cookie,
+                        .nr = trace->nr,
+                },
+        };
+
+        perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
 struct perf_text_poke_event {
         const void *old_bytes;
         const void *new_bytes;
@@ -14799,6 +14870,9 @@ void __init perf_event_init(void)
 
         idr_init(&pmu_idr);
 
+        unwind_deferred_init(&perf_unwind_work,
+                             perf_unwind_deferred_callback);
+
         perf_event_init_all_cpus();
         init_srcu_struct(&pmus_srcu);
         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);


@@ -463,7 +463,9 @@ struct perf_event_attr {
                                 inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                 remove_on_exec :  1, /* event is removed from task on exec */
                                 sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                                __reserved_1   : 26;
+                                defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                                defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                                __reserved_1   : 24;
 
         union {
                 __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
          */
         PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+        /*
+         * This user callchain capture was deferred until shortly before
+         * returning to user space. Previous samples would have kernel
+         * callchains only and they need to be stitched with this to make full
+         * callchains.
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *      u64                             cookie;
+         *      u64                             nr;
+         *      u64                             ips[nr];
+         *      struct sample_id                sample_id;
+         * };
+         */
+        PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
         PERF_RECORD_MAX,                        /* non-ABI */
 };
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
         PERF_CONTEXT_HV                         = (__u64)-32,
         PERF_CONTEXT_KERNEL                     = (__u64)-128,
         PERF_CONTEXT_USER                       = (__u64)-512,
+        PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
         PERF_CONTEXT_GUEST                      = (__u64)-2048,
         PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,