From e3223e1e9a9f7912763e314c9e8a8320c842be72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2025 11:16:48 -0400 Subject: tracing: Update function trace addresses with module addresses Now that module addresses are saved in the persistent ring buffer, their addresses can be used to adjust the address in the persistent ring buffer to the address of the module that is currently loaded. Instead of blindly using the text_delta that only works for core kernel code, call the trace_adjust_address() that will see if the address matches an address saved in the persistent ring buffer, and then uses that against the matching module if it is loaded. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250506111648.5df7f3ec@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b9ab06c99543..aab6816f0249 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1086,11 +1086,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, - unsigned long *args, int flags) + unsigned long parent_ip, unsigned long *args, + struct trace_array *tr, int flags) { - ip += delta; - parent_ip += delta; + ip = trace_adjust_address(tr, ip); + parent_ip = trace_adjust_address(tr, parent_ip); seq_print_ip_sym(s, ip, flags); if (args) @@ -1119,8 +1119,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, else args = NULL; - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, - args, flags); + print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1706,7 +1705,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - ip = field->ip + iter->tr->text_delta; + ip = trace_adjust_address(iter->tr, field->ip); seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); @@ -1792,7 +1791,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); + print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); -- cgit v1.2.3 From 531ee10b430eed70df360932086a2587d39b2bdf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Mar 2025 17:38:47 -0400 Subject: tracing: Show function names when possible when listing fields When the "fields" option is enabled, the "print fmt" of the trace event is ignored and only the fields are printed. But some fields contain function pointers. Instead of just showing the hex value in this case, show the function name when possible: Instead of having: # echo 1 > options/fields # cat trace [..] 
kmem_cache_free: call_site=0xffffffffa9afcf31 (-1448095951) ptr=0xffff888124452910 (-131386736039664) name=kmemleak_object Have it output: kmem_cache_free: call_site=rcu_do_batch+0x3d1/0x14a0 (-1768960207) ptr=0xffff888132ea5ed0 (854220496) name=kmemleak_object Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250325213919.624181915@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aab6816f0249..73037efdb45f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1015,14 +1015,24 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + if (sizeof(long) == 4) + trace_seq_printf(&iter->seq, "%pS (%d)", + *(void **)pos, + *(unsigned int *)pos); + else + trace_seq_printf(&iter->seq, "0x%x (%d)", + *(unsigned int *)pos, + *(unsigned int *)pos); break; case 8: - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + if (sizeof(long) == 8) + trace_seq_printf(&iter->seq, "%pS (%lld)", + *(void **)pos, + *(unsigned long long *)pos); + else + trace_seq_printf(&iter->seq, "0x%llx (%lld)", + *(unsigned long long *)pos, + *(unsigned long long *)pos); break; default: trace_seq_puts(&iter->seq, ""); -- cgit v1.2.3 From 00d872dd541cdf22230510201a1baf58f0147db9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2025 10:23:00 -0400 Subject: tracing: Only return an adjusted address if it matches the kernel address The trace_adjust_address() will take a given address and examine the persistent ring buffer to see if the address matches a module that is listed there. If it does not, it will just adjust the value to the core kernel delta. But if the address was for something that was not part of the core kernel text or data it should not be adjusted. Check the result of the adjustment and only return the adjustment if it lands in the current kernel text or data. If not, return the original address. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250506102300.0ba2f9e0@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b8db27fb6ef..01572ef79802 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6032,6 +6032,7 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) struct trace_module_delta *module_delta; struct trace_scratch *tscratch; struct trace_mod_entry *entry; + unsigned long raddr; int idx = 0, nr_entries; /* If we don't have last boot delta, return the address */ @@ -6045,7 +6046,9 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) module_delta = READ_ONCE(tr->module_delta); if (!module_delta || !tscratch->nr_entries || tscratch->entries[0].mod_addr > addr) { - return addr + tr->text_delta; + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; } /* Note that entries must be sorted. 
*/ -- cgit v1.2.3 From dc6a49d4cd2629859649883a84d6ee5a741ed03a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Mar 2025 18:56:19 -0400 Subject: tracing: Adjust addresses for printing out fields Add adjustments to the values of the "fields" output if the buffer is a persistent ring buffer to adjust the addresses to both the kernel core and kernel modules if they match a module in the persistent memory and that module is also loaded. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Andrew Morton Link: https://lore.kernel.org/20250325185619.54b85587@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 73037efdb45f..e7ebad177679 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -938,6 +938,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c struct list_head *head) { struct ftrace_event_field *field; + struct trace_array *tr = iter->tr; + unsigned long long laddr; + unsigned long addr; int offset; int len; int ret; @@ -974,8 +977,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c case FILTER_PTR_STRING: if (!iter->fmt_size) trace_iter_expand_format(iter); - pos = *(void **)pos; - ret = strncpy_from_kernel_nofault(iter->fmt, pos, + addr = trace_adjust_address(tr, *(unsigned long *)pos); + ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr, iter->fmt_size); if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", pos); @@ -984,8 +987,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c pos, iter->fmt); break; case FILTER_TRACE_FN: - pos = *(void **)pos; - trace_seq_printf(&iter->seq, "%pS", pos); + addr = trace_adjust_address(tr, *(unsigned long *)pos); + trace_seq_printf(&iter->seq, "%pS", (void *)addr); break; case FILTER_CPU: case FILTER_OTHER: @@ -1015,24 +1018,25 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - if (sizeof(long) == 4) + addr = *(unsigned int *)pos; + if (sizeof(long) == 4) { + addr = trace_adjust_address(tr, addr); trace_seq_printf(&iter->seq, "%pS (%d)", - *(void **)pos, - *(unsigned int *)pos); - else + (void *)addr, (int)addr); + } else { trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + (unsigned int)addr, (int)addr); + } break; case 8: - if (sizeof(long) == 8) + laddr = *(unsigned long long *)pos; + if (sizeof(long) == 8) { + laddr = trace_adjust_address(tr, (unsigned long)laddr); trace_seq_printf(&iter->seq, "%pS (%lld)", - *(void **)pos, - *(unsigned long long *)pos); - else - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + (void *)(long)laddr, laddr); + } else { + trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr); + } break; default: trace_seq_puts(&iter->seq, ""); -- cgit v1.2.3 From 3e4b37160b43a24ceda672033864c594d1f5cd8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2025 10:51:31 -0400 Subject: tracing: Show preempt and irq events callsites from the offsets in field print When the "fields" option is set in a trace instance, it ignores the "print fmt" portion of the trace event and just prints the raw fields defined by the TP_STRUCT__entry() of the TRACE_EVENT() macro. 
The preempt_disable/enable and irq_disable/enable events record only the caller offset from _stext to save space in the ring buffer. Even though the "fields" option only prints the fields, it also tries to print what they represent too, which includes function names. Add a check in the output of the event field printing to see if the field name is "caller_offs" or "parent_offs" and then print the function at the offset from _stext of that field. Instead of just showing: irq_disable: caller_offs=0xba634d (12215117) parent_offs=0x39d10e2 (60625122) Show: irq_disable: caller_offs=trace_hardirqs_off.part.0+0xad/0x130 0xba634d (12215117) parent_offs=_raw_spin_lock_irqsave+0x62/0x70 0x39d10e2 (60625122) Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250506105131.4b6089a9@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e7ebad177679..0b3db02030a7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1019,6 +1019,17 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c } addr = *(unsigned int *)pos; + + /* Some fields reference offset from _stext. */ + if (!strcmp(field->name, "caller_offs") || + !strcmp(field->name, "parent_offs")) { + unsigned long ip; + + ip = addr + (unsigned long)_stext; + ip = trace_adjust_address(tr, ip); + trace_seq_printf(&iter->seq, "%pS ", (void *)ip); + } + if (sizeof(long) == 4) { addr = trace_adjust_address(tr, addr); trace_seq_printf(&iter->seq, "%pS (%d)", -- cgit v1.2.3 From 872a0d90c1297a560b0fbee219dc5ef6eda9bcb4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Apr 2025 21:06:37 -0400 Subject: tracing: Always use memcpy() in histogram add_to_key() The add_to_key() function tests if the key is a string or some data. If it's a string it does some further calculations of the string size (still truncating it to the max size it can be), and calls strncpy(). If the key isn't as string it calls memcpy(). The interesting point is that both use the exact same parameters: strncpy(compound_key + key_field->offset, (char *)key, size); } else memcpy(compound_key + key_field->offset, key, size); As strncpy() is being used simply as a memcpy() for a string, and since strncpy() is deprecated, just call memcpy() for both memory and string keys. 
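(Editor's illustration, not part of the patch.) A minimal userspace sketch of why the switch is safe: add_to_key() clamps the copy size to key_field->size - 1 first, and event_hist_trigger() zeroes compound_key before the copy, so the NUL terminator comes from the pre-zeroed buffer rather than from strncpy(). The buffer names and sizes below are hypothetical stand-ins:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char key_a[16] = { 0 };   /* stands in for the zeroed compound_key */
            char key_b[16] = { 0 };
            const char *src = "sched_switch";
            size_t field_size = 8;    /* hypothetical key field size */
            size_t size = strlen(src);

            if (size > field_size - 1)        /* same clamp as add_to_key() */
                    size = field_size - 1;

            strncpy(key_a, src, size);        /* old call */
            memcpy(key_b, src, size);         /* new call */

            /* prints 1: both buffers hold exactly the same bytes */
            printf("%d\n", memcmp(key_a, key_b, sizeof(key_a)) == 0);
            return 0;
    }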
Cc: Mathieu Desnoyers Cc: Kees Cook Link: https://lore.kernel.org/20250403210637.1c477d4a@gandalf.local.home Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1260c23cfa5f..e139b58c3a43 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5224,10 +5224,8 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - - strncpy(compound_key + key_field->offset, (char *)key, size); - } else - memcpy(compound_key + key_field->offset, key, size); + } + memcpy(compound_key + key_field->offset, key, size); } static void -- cgit v1.2.3 From 7ab0fc61ce73040f89b12d76a8279995ec283541 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Apr 2025 12:38:51 -0400 Subject: tracing: Move histogram trigger variables from stack to per CPU structure The histogram trigger has three somewhat large arrays on the kernel stack: unsigned long entries[HIST_STACKTRACE_DEPTH]; u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; Checking the function event_hist_trigger() stack frame size, it currently uses 816 bytes for its stack frame due to these variables! Instead, allocate a per CPU structure that holds these arrays for each context level (normal, softirq, irq and NMI). That is, each CPU will have 4 of these structures. This will be allocated when the first histogram trigger is enabled and freed when the last is disabled. When the histogram callback triggers, it will request this structure. The request will disable preemption, get the per CPU structure at the index of the per CPU variable, and increment that variable. The callback will use the arrays in this structure to perform its work and then release the structure. That in turn will simply decrement the per CPU index and enable preemption. Moving the variables from the kernel stack to the per CPU structure brings the stack frame of event_hist_trigger() down to just 112 bytes. Cc: Mathieu Desnoyers Cc: Tom Zanussi Link: https://lore.kernel.org/20250407123851.74ea8d58@gandalf.local.home Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 120 ++++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e139b58c3a43..e85bc59c0421 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5244,17 +5244,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a tigger happens at each of + * these contexts and interrupts a currently active trigger. 
+ */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5262,12 +5339,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5291,26 +5374,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; + key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } hist_poll_wakeup(); + + out: + put_hist_pad(); } static void 
hist_trigger_stacktrace_print(struct seq_file *m, @@ -6155,6 +6243,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6199,6 +6290,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } static const struct event_trigger_ops event_hist_trigger_ops = { @@ -6214,9 +6306,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data) save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); - - return 0; + return event_hist_trigger_init(data->named_data); } static void event_hist_trigger_named_free(struct event_trigger_data *data) -- cgit v1.2.3 From 54c53dfdb681f65e0cc65ddb1e05d145fea5ae60 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Apr 2025 15:49:12 -0400 Subject: tracing: Add common_comm to histograms If one wants to trace the name of the task that wakes up a process and pass that to the synthetic events, there's nothing currently that lets the synthetic events do that. Add a "common_comm" to the histogram logic that allows histograms save the current->comm as a variable that can be passed through and added to a synthetic event: # cd /sys/kernel/tracing # echo 's:wake_lat char[] waker; char[] wakee; u64 delta;' >> dynamic_events # echo 'hist:keys=pid:comm=common_comm:ts=common_timestamp.usecs if !(common_flags & 0x18)' > events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:wake_comm=$comm:delta=common_timestamp.usecs-$ts:onmatch(sched.sched_waking).trace(wake_lat,$wake_comm,next_comm,$delta)' > events/sched/sched_switch/trigger The above will create a synthetic trace event that will save both the name of the waker and the wakee but only if the wakeup did not happen in a hard or soft interrupt context. The "common_comm" is used to save the task->comm at the time of the initial event and is passed via the "comm" variable to the second event, and that is saved as the "waker" field in the "wake_lat" synthetic event. 
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250407154912.3c6c6246@gandalf.local.home Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 51 +++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e85bc59c0421..58c9535f61df 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -114,6 +114,7 @@ enum hist_field_fn { HIST_FIELD_FN_BUCKET, HIST_FIELD_FN_TIMESTAMP, HIST_FIELD_FN_CPU, + HIST_FIELD_FN_COMM, HIST_FIELD_FN_STRING, HIST_FIELD_FN_DYNSTRING, HIST_FIELD_FN_RELDYNSTRING, @@ -506,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, HIST_FIELD_FL_GRAPH = 1 << 20, + HIST_FIELD_FL_COMM = 1 << 21, }; struct var_defs { @@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +static u64 hist_field_comm(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return (u64)(unsigned long)current->comm; +} + /** * check_field_for_var_ref - Check if a VAR_REF field references a variable * @hist_field: The VAR_REF field to check @@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_COMM) + field_name = "common_comm"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2015,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_COMM) { + hist_field->fn_num = HIST_FIELD_FN_COMM; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = "char[]"; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2359,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; - } else if (strcmp(field_name, "common_cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) { *flags |= HIST_FIELD_FL_CPU; - else if (strcmp(field_name, "hitcount") == 0) + } else if (strcmp(field_name, "common_comm") == 0) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; + } else if (strcmp(field_name, "hitcount") == 0) *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); @@ -2377,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_CPU; } else if (field && field->filter_type == FILTER_STACKTRACE) { *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (field && field->filter_type == FILTER_COMM) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -4327,6 +4351,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return hist_field_timestamp(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_CPU: return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COMM: + return hist_field_comm(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_STRING: return hist_field_string(hist_field, elt, 
buffer, rbe, event); case HIST_FIELD_FN_DYNSTRING: @@ -5212,14 +5238,19 @@ static inline void add_to_key(char *compound_key, void *key, size_t size = key_field->size; if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_RDYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_STATIC_STRING) - size = field->size; + if (key_field->flags & HIST_FIELD_FL_COMM) { + size = strlen((char *)key); + } else { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + } /* ensure NULL-termination */ if (size > key_field->size - 1) @@ -6097,6 +6128,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + if (hist_field->flags & HIST_FIELD_FL_COMM) + seq_puts(m, "common_comm"); else if (hist_field->flags & HIST_FIELD_FL_CONST) seq_printf(m, "%llu", hist_field->constant); else if (field_name) { -- cgit v1.2.3 From 88cefd99ee265b18f851b911a75282146ecabc3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Apr 2025 15:38:30 -0400 Subject: ftrace: Show subops in enabled_functions The function graph infrastructure uses subops of the function tracer. These are not shown in enabled_functions. Add a "subops:" section to the enabled_functions line to show what functions are attached via subops. If the subops is from the function_graph infrastructure, then show the entry and return callbacks that are attached. Here's an example of the output: schedule_on_each_cpu (1) tramp: 0xffffffffc03ef000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 subops: {ent:trace_graph_entry+0x0/0x20 ret:trace_graph_return+0x0/0x150} Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250410153830.5d97f108@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 ++ kernel/trace/fgraph.c | 2 ++ kernel/trace/ftrace.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbabc3d848b3..fc939ca2ff66 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -328,6 +328,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * DIRECT - Used by the direct ftrace_ops helper for direct functions * (internal ftrace only, should not be used by others) * SUBOP - Is controlled by another op in field managed. 
+ * GRAPH - Is a component of the fgraph_ops structure */ enum { FTRACE_OPS_FL_ENABLED = BIT(0), @@ -349,6 +350,7 @@ enum { FTRACE_OPS_FL_PERMANENT = BIT(16), FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), + FTRACE_OPS_FL_GRAPH = BIT(19), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8d925cbdce3a..c5b207992fb4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1382,6 +1382,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) /* Always save the function, and reset at unregistering */ gops->saved_func = gops->entryfunc; + gops->ops.flags |= FTRACE_OPS_FL_GRAPH; + ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); if (!ret) fgraph_array[i] = gops; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6981830c3128..014cd2cedae3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4373,6 +4373,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip) } #endif +static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + struct ftrace_ops *subops; + bool first = true; + + list_for_each_entry(subops, &ops->subop_list, list) { + if (!((subops->flags & FTRACE_OPS_FL_ENABLED) && + hash_contains_ip(rec->ip, subops->func_hash))) + continue; + if (first) { + seq_printf(m, "\tsubops:"); + first = false; + } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (subops->flags & FTRACE_OPS_FL_GRAPH) { + struct fgraph_ops *gops; + + gops = container_of(subops, struct fgraph_ops, ops); + seq_printf(m, " {ent:%pS ret:%pS}", + (void *)gops->entryfunc, + (void *)gops->retfunc); + continue; + } +#endif + if (subops->trampoline) { + seq_printf(m, " {%pS (%pS)}", + (void *)subops->trampoline, + (void *)subops->func); + add_trampoline_func(m, subops, rec); + } else { + seq_printf(m, " {%pS}", + (void *)subops->func); + } + } +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -4425,6 +4461,7 @@ static int t_show(struct seq_file *m, void *v) (void *)ops->trampoline, (void *)ops->func); add_trampoline_func(m, ops, rec); + print_subops(m, ops, rec); ops = ftrace_find_tramp_ops_next(rec, ops); } while (ops); } else @@ -4437,6 +4474,7 @@ static int t_show(struct seq_file *m, void *v) if (ops) { seq_printf(m, "\tops: %pS (%pS)", ops, ops->func); + print_subops(m, ops, rec); } else { seq_puts(m, "\tops: ERROR!"); } -- cgit v1.2.3 From 761ef34228222686f9f8c5fccd258e17a0c3d6de Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sun, 13 Apr 2025 00:10:44 +0200 Subject: ftrace: Expose call graph depth as unsigned int Depth is stored as int because the code uses negative values to break out of iterations. But what is recorded is always zero or positive. So expose it as unsigned int instead of int. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Sven Schnelle Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250412221847.17310-3-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_entries.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4ef4df6623a8..de294ae2c5c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -97,11 +97,11 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, F_STRUCT( __field_struct( struct fgraph_retaddr_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __field_packed( unsigned long, graph_ent, retaddr ) ), - F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); @@ -124,13 +124,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) __field_packed( unsigned long, ret, retval ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth, __entry->retval) @@ -146,13 +146,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth) -- cgit v1.2.3 From a54665ab7c20081cb6299d79dcab9960752bc741 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Apr 2025 16:54:20 -0400 Subject: ftrace: Comment that ftrace_func_mapper is freed with free_ftrace_hash() The structure ftrace_func_mapper only contains a single field and that is a ftrace_hash. It is used to abstract it out from a normal hash to control users of how it gets modified. The freeing of a ftrace_func_mapper structure is: free_ftrace_hash(&mapper->hash); Without context, this looks like a bug. It should be commented that it is not a bug and it is freed this way. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250416165420.5c717420@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 014cd2cedae3..1af952cba48d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5208,8 +5208,12 @@ struct ftrace_func_map { void *data; }; +/* + * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash). + * The hash field must be the first field. 
+ */ struct ftrace_func_mapper { - struct ftrace_hash hash; + struct ftrace_hash hash; /* Must be first! */ }; /** @@ -5344,6 +5348,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } } } + /* This also frees the mapper itself */ free_ftrace_hash(&mapper->hash); } -- cgit v1.2.3 From 17f89102fe23d7389085a8820550df688f79888a Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Fri, 25 Apr 2025 11:18:39 +0200 Subject: tracing/osnoise: Allow arbitrarily long CPU string Allocate kernel memory for processing CPU string (/sys/kernel/tracing/osnoise/cpus) also in osnoise_cpus_write to allow the writing of a CPU string of an arbitrary length. This replaces the 256-byte buffer, which is insufficient with the rising number of CPUs. For example, if I wanted to measure on every even CPU on a system with 256 CPUs, the string would be 456 characters long. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250425091839.343289-1-tglozar@redhat.com Signed-off-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e732c9e37e14..6819b93309ce 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2302,7 +2302,7 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -2320,10 +2320,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; -- cgit v1.2.3 From 6936298393d8d8bc3cec6b704f6a774162cf9bd3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:05 -0400 Subject: tracing/mmiotrace: Remove reference to unused per CPU data pointer The mmiotracer referenced the per CPU array_buffer->data descriptor but never actually used it. Remove the references to it. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212234.696945463@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_mmiotrace.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ba5858866b2f..c706544be60c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -291,7 +291,6 @@ __init static int init_mmio_trace(void) device_initcall(init_mmio_trace); static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_rw *rw) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_rw(tr, data, rw); + __trace_mmiotrace_rw(tr, rw); } static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_map *map) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); + __trace_mmiotrace_map(tr, map); } int mmio_trace_printk(const char *fmt, va_list args) -- cgit v1.2.3 From c638ebd8232e69daed2f2573365c48cd1e064a89 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:06 -0400 Subject: ftrace: Do not bother checking per CPU "disabled" flag The per CPU "disabled" value was the original way to disable tracing when the tracing subsystem was first created. Today, the ring buffer infrastructure has its own way to disable tracing. In fact, things have changed so much since 2008 that many things ignore the disable flag. There's no reason for the function tracer to check it, if tracing is disabled, the ring buffer will not record the event anyway. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212234.868972758@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 98ccf3f00c51..bd153219a712 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -209,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -224,9 +223,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_dec(); - data = this_cpu_ptr(tr->array_buffer.data); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, NULL); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); ftrace_test_recursion_unlock(bit); } @@ -236,10 +233,8 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -250,10 +245,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, fregs); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -352,7 +344,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_func_repeats *last_info; struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -364,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, return; parent_ip = function_get_true_parent_ip(parent_ip, fregs); - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on(tr)) goto out; /* -- cgit v1.2.3 From 1577683a925f63862e81d27246a3a0bc1f6e39ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:07 -0400 Subject: tracing: Just use this_cpu_read() to access ignore_pid The ignore_pid boolean on the per CPU data descriptor is updated at sched_switch when a new task is scheduled in. If the new task is to be ignored, it is set to true, otherwise it is set to false. The current task should always have the correct value as it is updated when the task is scheduled in. Instead of breaking up the read of this value, which requires preemption to be disabled, just use this_cpu_read() which gives a snapshot of the value. Since the value will always be correct for a given task (because it's updated at sched switch) it doesn't need preemption disabled. This will also allow trace events to be called with preemption enabled. 
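(Editor's sketch, not part of the patch; helper names are hypothetical, the field accesses mirror the diff below.) The two access patterns side by side: this_cpu_ptr() hands back a pointer that is only meaningful while the task cannot migrate, whereas this_cpu_read() performs the whole access as a single per-CPU operation:

    /* old: pointer + dereference; only correct if the surrounding
     * context keeps preemption disabled, so the task cannot migrate
     * between taking the pointer and reading through it */
    static bool ignore_this_pid(struct trace_array *tr)
    {
            struct trace_array_cpu *data;

            data = this_cpu_ptr(tr->array_buffer.data);
            return data->ignore_pid;
    }

    /* new: one snapshot read, safe to call with preemption enabled */
    static bool ignore_this_pid_snapshot(struct trace_array *tr)
    {
            return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0;
    }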
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212235.038958766@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 069e92856bda..fe0ea14d809e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -622,7 +622,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; - struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; @@ -632,9 +631,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list && !no_pid_list) return false; - data = this_cpu_ptr(tr->array_buffer.data); - - return data->ignore_pid; + /* + * This is recorded at every sched_switch for this task. + * Thus, even if the task migrates the ignore value will be the same. + */ + return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); -- cgit v1.2.3 From dbecef68ad33ddc8fe76a276eeb914d51d29b4c4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:08 -0400 Subject: tracing: Add tracer_tracing_disable/enable() functions Allow a tracer to disable writing to its buffer for a temporary amount of time and re-enable it. The tracer_tracing_disable() will disable writing to the trace array buffer, and requires a tracer_tracing_enable() to re-enable it. The difference between tracer_tracing_disable() and tracer_tracing_off() is that the disable version can nest, and requires as many enable() calls as disable() calls to re-enable the buffer. Where as the off() function can be called multiple times and only requires a singe tracer_tracing_on() to re-enable the buffer. Cc: Jason Wessel Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/20250505212235.210330010@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 33 +++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 01572ef79802..b691af1c1b5a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1582,6 +1582,39 @@ void tracer_tracing_off(struct trace_array *tr) smp_wmb(); } +/** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. + */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. 
+ */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); +} + /** * tracing_off - turn off tracing buffers * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79be1995db44..74f1fe5788d4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -665,6 +665,8 @@ bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); +void tracer_tracing_disable(struct trace_array *tr); +void tracer_tracing_enable(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, -- cgit v1.2.3 From 6ba3e0533fa63bdeb5b3c4c9688a4ecba2705722 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:09 -0400 Subject: tracing: Use tracer_tracing_disable() instead of "disabled" field for ftrace_dump_one() The per CPU "disabled" value was the original way to disable tracing when the tracing subsystem was first created. Today, the ring buffer infrastructure has its own way to disable tracing. In fact, things have changed so much since 2008 that many things ignore the disable flag. The ftrace_dump_one() function iterates over all the current tracing CPUs and increments the "disabled" counter before doing the dump, and decrements it afterward. As the disabled flag can be ignored, doing this today is not reliable. Instead use the new tracer_tracing_disable() that calls into the ring buffer code to do the disabling. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212235.381188238@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b691af1c1b5a..bb514e988f22 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10484,7 +10484,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; + int cnt = 0; /* * Always turn off tracing when we dump. @@ -10501,9 +10501,8 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* Simulate the iterator */ trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -10562,9 +10561,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(tr); local_irq_restore(flags); } -- cgit v1.2.3 From a9839d204896c59b0e2ae08e571515b1cf752bd1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:10 -0400 Subject: tracing: kdb: Use tracer_tracing_on/off() instead of setting per CPU disabled The per CPU "disabled" value was the original way to disable tracing when the tracing subsystem was first created. Today, the ring buffer infrastructure has its own way to disable tracing. In fact, things have changed so much since 2008 that many things ignore the disable flag. 
The kdb_ftdump() function iterates over all the current tracing CPUs and increments the "disabled" counter before doing the dump, and decrements it afterward. As the disabled flag can be ignored, doing this today is not reliable. Instead, simply call tracer_tracing_off() and then tracer_tracing_on() to disable and then enabled the entire ring buffer in one go! Cc: Jason Wessel Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/20250505212235.549033722@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_kdb.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 1e72d20b3c2f..d7b135de958a 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -98,7 +98,6 @@ static int kdb_ftdump(int argc, const char **argv) long cpu_file; int err; int cnt; - int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -120,9 +119,7 @@ static int kdb_ftdump(int argc, const char **argv) trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_disable(iter.tr); /* A negative skip_entries means skip all but the last entries */ if (skip_entries < 0) { @@ -135,9 +132,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(iter.tr); kdb_trap_printk--; -- cgit v1.2.3 From f62e3de375150210335a063605ce0dd6a6746b78 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:11 -0400 Subject: ftrace: Do not disabled function graph based on "disabled" field The per CPU "disabled" value was the original way to disable tracing when the tracing subsystem was first created. Today, the ring buffer infrastructure has its own way to disable tracing. In fact, things have changed so much since 2008 that many things ignore the disable flag. Do not bother disabling the function graph tracer if the per CPU disabled field is set. Just record as normal. If tracing is disabled in the ring buffer it will not be recorded. Also, when tracing is enabled again, it will not drop the return call of the function. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212235.715752008@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 38 +++++++++--------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0c357a89c58e..9234e2c39abf 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -202,12 +202,9 @@ static int graph_entry(struct ftrace_graph_ent *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; - long disabled; int ret = 0; - int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) return 0; @@ -257,21 +254,14 @@ static int graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { - unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else { - ret = __graph_entry(tr, trace, trace_ctx, fregs); - } + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); + } else { + ret = __graph_entry(tr, trace, trace_ctx, fregs); } - preempt_enable_notrace(); return ret; } @@ -351,13 +341,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; u64 calltime, rettime; - long disabled; int size; - int cpu; rettime = trace_clock_local(); @@ -376,15 +363,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace, calltime = ftimes->calltime; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); - } - preempt_enable_notrace(); + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, -- cgit v1.2.3 From 969043af1590d22c6de550bb26ee465de72330b3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:12 -0400 Subject: tracing: Do not use per CPU array_buffer.data->disabled for cpumask The per CPU "disabled" value was the original way to disable tracing when the tracing subsystem was first created. Today, the ring buffer infrastructure has its own way to disable tracing. In fact, things have changed so much since 2008 that many things ignore the disable flag. Do not bother setting the per CPU disabled flag of the array_buffer data to use to determine what CPUs can write to the buffer and only rely on the ring buffer code itself to disabled it. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212235.885452497@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb514e988f22..0cd681516438 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5081,7 +5081,6 @@ int tracing_set_cpumask(struct trace_array *tr, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); @@ -5089,7 +5088,6 @@ int tracing_set_cpumask(struct trace_array *tr, } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); -- cgit v1.2.3 From 092a38565ed87cbda6fe7dd16c742df6f907483e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:13 -0400 Subject: ring-buffer: Add ring_buffer_record_is_on_cpu() Add the function ring_buffer_record_is_on_cpu() that returns true if the ring buffer for a give CPU is writable and false otherwise. Also add tracer_tracing_is_on_cpu() to return if the ring buffer for a given CPU is writeable for a given trace_array. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.059853898@goodmis.org Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 18 ++++++++++++++++++ kernel/trace/trace.h | 15 +++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 56e27263acf8..cd7f0ae26615 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,6 +192,7 @@ void ring_buffer_record_off(struct trace_buffer *buffer); void ring_buffer_record_on(struct trace_buffer *buffer); bool ring_buffer_record_is_on(struct trace_buffer *buffer); bool ring_buffer_record_is_set_on(struct trace_buffer *buffer); +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0f877d39a24..1ca482955dae 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4882,6 +4882,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } +/** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. 
+ */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 74f1fe5788d4..69c1ecfb2290 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -673,6 +673,21 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); + +/** + * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu + * @tr : the trace array to know if ring buffer is enabled + * @cpu: The cpu buffer to check if enabled + * + * Shows real state of the per CPU buffer if it is enabled or not. + */ +static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); + return false; +} + int tracing_init_dentry(void); struct ring_buffer_event; -- cgit v1.2.3 From cf64792f0adb9488b66da1d8e1805d51bdcebeab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:14 -0400 Subject: tracing: branch: Use trace_tracing_is_on_cpu() instead of "disabled" field The branch tracer currently checks the per CPU "disabled" field to know if tracing is enabled or not for the CPU. As the "disabled" value is not used anymore to turn of tracing generically, use tracing_tracer_is_on_cpu() instead. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.224658526@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_branch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6d08a5523ce0..6809b370e991 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; - struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; @@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id())) goto out; trace_ctx = tracing_gen_ctx_flags(flags); -- cgit v1.2.3 From 90633c34c36d0c15c9da4e19b2ceb46cab137478 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:15 -0400 Subject: tracing: Convert the per CPU "disabled" counter to local from atomic The per CPU "disabled" counter is used for the latency tracers and stack tracers to make sure that their accounting isn't messed up by an NMI or interrupt coming in and affecting the same CPU data. But the counter is an atomic_t type. As it only needs to synchronize against the current CPU, switch it over to local_t type. 
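A minimal sketch of the guard pattern that remains after the conversion (the helper name is made up; struct trace_array_cpu with a local_t "disabled" field is assumed from this patch, and local_inc_return()/local_dec() are the stock <asm/local.h> primitives):

    /* Illustration only: only the first, non-nested entry on this CPU does
     * the accounting; a nested NMI or interrupt sees disabled > 1. */
    static void guarded_record(struct trace_array_cpu *data)
    {
            long disabled = local_inc_return(&data->disabled);

            if (likely(disabled == 1)) {
                    /* record the trace entry / update latency accounting */
            }
            local_dec(&data->disabled);
    }
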
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.394925376@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions.c | 8 ++++---- kernel/trace/trace_irqsoff.c | 22 +++++++++++----------- kernel/trace/trace_sched_wakeup.c | 18 +++++++++--------- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 69c1ecfb2290..188032d4ab69 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -183,7 +183,7 @@ struct trace_array; * the trace, etc.) */ struct trace_array_cpu { - atomic_t disabled; + local_t disabled; void *buffer_page; /* ring buffer spare */ unsigned long entries; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index bd153219a712..99a90f182485 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -291,7 +291,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); @@ -303,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, __trace_stack(tr, trace_ctx, skip); } - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -402,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { last_info = per_cpu_ptr(tr->last_func_repeats, cpu); @@ -417,7 +417,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, } out: - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 40c39e946940..0b6d932a931e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (likely(disabled == 1)) return 1; - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); return 0; } @@ -152,7 +152,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } #endif /* CONFIG_FUNCTION_TRACER */ @@ -209,7 +209,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); return ret; } @@ -238,7 +238,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, trace_ctx = tracing_gen_ctx_flags(flags); __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static struct fgraph_ops fgraph_ops = { @@ -408,10 +408,10 @@ start_critical_timing(unsigned long 
ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (unlikely(!data) || atomic_read(&data->disabled)) + if (unlikely(!data) || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + local_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); @@ -421,7 +421,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) per_cpu(tracing_cpu, cpu) = 1; - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static nokprobe_inline void @@ -445,16 +445,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) + !data->critical_start || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + local_inc(&data->disabled); trace_ctx = tracing_gen_ctx(); __trace_function(tr, ip, parent_ip, trace_ctx); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; - atomic_dec(&data->disabled); + local_dec(&data->disabled); } /* start and stop critical timings used to for stoppage (in idle) */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a0db3404f7f7..bf1cb80742ae 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr, goto out_enable; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; return 1; out: - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); out_enable: preempt_enable_notrace(); @@ -144,7 +144,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, *calltime = trace_clock_local(); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return ret; @@ -173,7 +173,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, return; __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return; @@ -243,7 +243,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); } @@ -471,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -508,7 +508,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -563,7 +563,7 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); 
if (unlikely(disabled != 1)) goto out; @@ -610,7 +610,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) -- cgit v1.2.3 From c4a80c06154084d52851fe495d40b0e4da032f47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:16 -0400 Subject: tracing: Use atomic_inc_return() for updating "disabled" counter in irqsoff tracer The irqsoff tracer uses the per CPU "disabled" field to prevent corruption of the accounting when it starts to trace interrupts disabled, but there's a slight race that could happen if for some reason it was called twice. Use atomic_inc_return() instead. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.567884756@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_irqsoff.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 0b6d932a931e..5496758b6c76 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -397,6 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; + long disabled; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -411,15 +412,17 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (unlikely(!data) || local_read(&data->disabled)) return; - local_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; + if (disabled == 1) { + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; - __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); - per_cpu(tracing_cpu, cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; + } local_dec(&data->disabled); } @@ -431,6 +434,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + long disabled; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -448,12 +452,15 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) !data->critical_start || local_read(&data->disabled)) return; - local_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); + + if (disabled == 1) { + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + } - trace_ctx = tracing_gen_ctx(); - __trace_function(tr, ip, parent_ip, trace_ctx); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; local_dec(&data->disabled); } -- cgit v1.2.3 From 6e3b3acaf452ecbd0bde5f25e05c844a22a86574 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:17 -0400 Subject: tracing: Remove unused buffer_page field from trace_array_cpu structure The trace_array_cpu had a "buffer_page" field that was originally going to be used as a backup page for the ring buffer. 
But the ring buffer has its own way of reusing pages and this field was never used. Remove it. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.738849456@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 188032d4ab69..4e67ee92e05c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -184,7 +184,6 @@ struct trace_array; */ struct trace_array_cpu { local_t disabled; - void *buffer_page; /* ring buffer spare */ unsigned long entries; unsigned long saved_latency; -- cgit v1.2.3 From 73207746d36bff777a2edb337d8d0518cbd44715 Mon Sep 17 00:00:00 2001 From: Devaansh Kumar Date: Wed, 7 May 2025 19:08:36 +0530 Subject: tracing: Replace deprecated strncpy() with strscpy() for stack_trace_filter_buf strncpy() is deprecated for NUL-terminated destination buffers and must be replaced by strscpy(). See issue: https://github.com/KSPP/linux/issues/90 Link: https://lore.kernel.org/20250507133837.19640-1-devaanshk840@gmail.com Signed-off-by: Devaansh Kumar Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..4c349db381cb 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -542,7 +542,7 @@ static __init int enable_stacktrace(char *str) int len; if ((len = str_has_prefix(str, "_filter="))) - strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + strscpy(stack_trace_filter_buf, str + len); stack_tracer_enabled = 1; return 1; -- cgit v1.2.3 From f2947c4b7d0f235621c5daf78aecfbd6e22c05e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 May 2025 10:53:06 -0400 Subject: tracing: Rename event_trigger_alloc() to trigger_data_alloc() The function event_trigger_alloc() creates an event_trigger_data descriptor and states that it needs to be freed via event_trigger_free(). This is incorrect, it needs to be freed by trigger_data_free() as event_trigger_free() adds ref counting. Rename event_trigger_alloc() to trigger_data_alloc() and state that it needs to be freed via trigger_data_free(). This naming convention was introducing bugs. 
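A short sketch (hypothetical function, not taken from the patch) of the pairing the rename is meant to make obvious: objects from trigger_data_alloc() are released with trigger_data_free(), while event_trigger_free() is reserved for the refcounted path. trigger_data_alloc(), trigger_data_free() and event_trigger_parse_num() are the helpers used in this series:

    static int example_parse(struct event_command *cmd_ops, char *cmd,
                             char *param, void *private_data)
    {
            struct event_trigger_data *data;
            int ret;

            data = trigger_data_alloc(cmd_ops, cmd, param, private_data);
            if (!data)
                    return -ENOMEM;

            ret = event_trigger_parse_num(param, data);
            if (ret < 0)
                    trigger_data_free(data);  /* not event_trigger_free() */

            /* on success, registration (elided here) takes over the object */
            return ret;
    }
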
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tom Zanussi Link: https://lore.kernel.org/20250507145455.776436410@goodmis.org Fixes: 86599dbe2c527 ("tracing: Add helper functions to simplify event_command.parse() callback handling") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 8 +++----- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_trigger.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4e67ee92e05c..86e9d7dcddba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1788,6 +1788,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1814,11 +1817,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 58c9535f61df..1d536219b624 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6826,7 +6826,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b66b6d235d91..dac3344ee345 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -804,7 +804,7 @@ out: } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string @@ -815,14 +815,14 @@ out: * trigger_ops to assign to the event_trigger_data. @private_data can * also be passed in and associated with the event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. 
* * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; const struct event_trigger_ops *trigger_ops; @@ -989,7 +989,7 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; @@ -1793,7 +1793,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); goto out; -- cgit v1.2.3 From c5dd28e7fb4f63475b50df4f58311df92939d011 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 7 May 2025 10:53:07 -0400 Subject: tracing: Fix error handling in event_trigger_parse() According to trigger_data_alloc() doc, trigger_data_free() should be used to free an event_trigger_data object. This fixes a mismatch introduced when kzalloc was replaced with trigger_data_alloc without updating the corresponding deallocation calls. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Mathieu Desnoyers Cc: Tom Zanussi Link: https://lore.kernel.org/20250507145455.944453325@goodmis.org Link: https://lore.kernel.org/20250318112737.4174-1-linmq006@gmail.com Fixes: e1f187d09e11 ("tracing: Have existing event_command.parse() implementations use helpers") Signed-off-by: Miaoqian Lin [ SDR: Changed event_trigger_alloc/free() to trigger_data_alloc/free() ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index dac3344ee345..c316badc608b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -995,7 +995,7 @@ event_trigger_parse(struct event_command *cmd_ops, if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); + trigger_data_free(trigger_data); ret = 0; goto out; } @@ -1022,7 +1022,7 @@ event_trigger_parse(struct event_command *cmd_ops, out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); + trigger_data_free(trigger_data); goto out; } -- cgit v1.2.3 From f75340d73c5e25bd94f964a8e3dac9fd3e5f1475 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 May 2025 10:53:08 -0400 Subject: tracing: Remove unnecessary "goto out" that simply returns ret is trigger code There's several functions that have "goto out;" where the label out is just: out: return ret; Simplify the code by just doing the return in the location and removing all the out labels and jumps. 
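The shape of the cleanup, shown on a made-up fragment (kernel context assumed; illustration only):

    /* before */
    static int setup_buf(void **out)
    {
            void *data = kzalloc(16, GFP_KERNEL);
            int ret = 0;

            if (!data) {
                    ret = -ENOMEM;
                    goto out;
            }
            *out = data;
     out:
            return ret;
    }

    /* after: the label and jump add nothing, so return directly */
    static int setup_buf(void **out)
    {
            void *data = kzalloc(16, GFP_KERNEL);

            if (!data)
                    return -ENOMEM;
            *out = data;
            return 0;
    }
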
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tom Zanussi Link: https://lore.kernel.org/20250507145456.121186494@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 44 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index c316badc608b..fdd1112388e9 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -552,16 +552,14 @@ static int register_trigger(char *glob, lockdep_assert_held(&event_mutex); list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { - ret = -EEXIST; - goto out; - } + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) + return -EEXIST; } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -572,7 +570,6 @@ static int register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -770,7 +767,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!param_and_filter) { if (param_required) ret = -EINVAL; - goto out; + return ret; } /* @@ -781,7 +778,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, */ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { *filter = param_and_filter; - goto out; + return ret; } /* @@ -799,7 +796,6 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!**filter) *filter = NULL; } -out: return ret; } @@ -991,13 +987,12 @@ event_trigger_parse(struct event_command *cmd_ops, ret = -ENOMEM; trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) - goto out; + return ret; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); trigger_data_free(trigger_data); - ret = 0; - goto out; + return 0; } ret = event_trigger_parse_num(param, trigger_data); @@ -1017,13 +1012,12 @@ event_trigger_parse(struct event_command *cmd_ops, /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_data); - out: return ret; out_free: event_trigger_reset_filter(cmd_ops, trigger_data); trigger_data_free(trigger_data); - goto out; + return ret; } /** @@ -1057,10 +1051,10 @@ int set_trigger_filter(char *filter_str, s = strsep(&filter_str, " \t"); if (!strlen(s) || strcmp(s, "if") != 0) - goto out; + return ret; if (!filter_str) - goto out; + return ret; /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, @@ -1104,7 +1098,6 @@ int set_trigger_filter(char *filter_str, ret = -ENOMEM; } } - out: return ret; } @@ -1772,7 +1765,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); if (!event_enable_file) - goto out; + return ret; #ifdef CONFIG_HIST_TRIGGERS hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || @@ -1787,7 +1780,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); if (!enable_data) - goto out; + return ret; enable_data->hist = hist; enable_data->enable = enable; @@ -1796,7 +1789,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { 
kfree(enable_data); - goto out; + return ret; } if (remove) { @@ -1804,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, kfree(trigger_data); kfree(enable_data); ret = 0; - goto out; + return ret; } /* Up the trigger_data count to make sure nothing frees it on failure */ @@ -1834,7 +1827,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, goto out_disable; event_trigger_free(trigger_data); - out: return ret; out_disable: trace_event_enable_disable(event_enable_file, 0, 1); @@ -1845,7 +1837,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); kfree(enable_data); - goto out; + return ret; } int event_enable_register_trigger(char *glob, @@ -1865,15 +1857,14 @@ int event_enable_register_trigger(char *glob, (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { - ret = -EEXIST; - goto out; + return -EEXIST; } } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -1884,7 +1875,6 @@ int event_enable_register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } -- cgit v1.2.3 From 6956ea9fdcf706dc6201bf7146af188539eac9b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 May 2025 19:17:03 -0400 Subject: tracing: Add a helper function to handle the dereference arg in verifier Add a helper function called handle_dereference_arg() to replace the logic that is identical in two locations of test_event_printk(). Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250507191703.5dd8a61d@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fe0ea14d809e..120531268abf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca return true; } +static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, + u64 *dereference_flags, int arg, + struct trace_event_call *call) +{ + if (string_flags & (1ULL << arg)) { + if (process_string(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + else + pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", + len, arg_str); +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -563,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + e - start_arg, + &dereference_flags, arg, call); } start_arg = i; @@ -578,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if 
(process_pointer(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + i - start_arg, + &dereference_flags, arg, call); } /* -- cgit v1.2.3 From 7b382efd5e8af4c0c67e70ad3fb599dcd2dc0b86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 May 2025 09:56:39 -0400 Subject: tracing: Allow the top level trace_marker to write into another instances There are applications that have it hard coded to write into the top level trace_marker instance (/sys/kernel/tracing/trace_marker). This can be annoying if a profiler is using that instance for other work, or if it needs all writes to go into a new instance. A new option is created called "copy_trace_marker". By default, the top level has this set, as that is the default buffer that writing into the top level trace_marker file will go to. But now if an instance is created and sets this option, all writes into the top level trace_marker will also be written into that instance buffer just as if an application were to write into the instance's trace_marker file. If the top level instance disables this option, then writes to its own trace_marker and trace_marker_raw files will not go into its buffer. If no instance has this option set, then the write will return an error and errno will contain ENODEV. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250508095639.39f84eda@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 13 ++++ kernel/trace/trace.c | 144 ++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 2 + 3 files changed, 128 insertions(+), 31 deletions(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index c9e88bf65709..af66a05e18cc 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1205,6 +1205,19 @@ Here are the available options: default instance. The only way the top level instance has this flag cleared, is by it being set in another instance. + copy_trace_marker + If there are applications that hard code writing into the top level + trace_marker file (/sys/kernel/tracing/trace_marker or trace_marker_raw), + and the tooling would like it to go into an instance, this option can + be used. Create an instance and set this option, and then all writes + into the top level trace_marker file will also be redirected into this + instance. + + Note, by default this option is set for the top level instance. If it + is disabled, then writes to the trace_marker or trace_marker_raw files + will not be written into the top level file. If no instance has this + option set, then a write will error with the errno of ENODEV. 
+ annotate It is sometimes confusing when the CPU buffers are full and one CPU buffer had a lot of events recently, thus diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cd681516438..cf51c30b137f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -493,7 +493,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -501,7 +502,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* * The global_trace is the descriptor that holds the top-level tracing @@ -513,6 +515,9 @@ static struct trace_array global_trace = { static struct trace_array *printk_trace = &global_trace; +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); + static __always_inline bool printk_binsafe(struct trace_array *tr) { /* @@ -534,6 +539,28 @@ static void update_printk_trace(struct trace_array *tr) tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; } +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (!list_empty(&tr->marker_list)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER_COPY_MARKER; + return true; + } + + if (list_empty(&tr->marker_list)) + return false; + + list_del_init(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; + return true; +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -5220,7 +5247,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK)) + (mask == TRACE_ITER_TRACE_PRINTK) || + (mask == TRACE_ITER_COPY_MARKER)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5251,6 +5279,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) } } + if (mask == TRACE_ITER_COPY_MARKER) + update_marker_trace(tr, enabled); + if (enabled) tr->trace_flags |= mask; else @@ -7134,11 +7165,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf, + size_t cnt, unsigned long ip) { - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; @@ -7152,18 +7181,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, #define FAULTED_STR "" #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if ((ssize_t)cnt < 0) - return -EINVAL; - - if 
(cnt > TRACE_MARKER_MAX_SIZE) - cnt = TRACE_MARKER_MAX_SIZE; - meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; @@ -7196,7 +7213,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; + entry->ip = ip; len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { @@ -7229,18 +7246,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_mark_raw_write(struct file *filp, const char __user *ubuf, +tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct raw_data_entry *entry; - ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + ssize_t written = -ENODEV; + unsigned long ip; if (tracing_disabled) return -EINVAL; @@ -7248,10 +7259,42 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int)) + if ((ssize_t)cnt < 0) return -EINVAL; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + } + + return written; +} + +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char __user *ubuf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; @@ -7282,6 +7325,40 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return written; } +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + } + + return written; +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -9775,6 +9852,7 @@ trace_array_create_systems(const char *name, const char *systems, INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); @@ -9934,6 +10012,9 @@ static int __remove_instance(struct trace_array *tr) if (printk_trace == tr) 
update_printk_trace(&global_trace); + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -10999,6 +11080,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86e9d7dcddba..bd084953a98b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -403,6 +403,7 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct list_head marker_list; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -1384,6 +1385,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ + C(COPY_MARKER, "copy_trace_marker"),\ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ -- cgit v1.2.3 From 45c28cdce7a1648c12bb8f546a67abf908db106e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 29 Apr 2025 15:51:18 -0400 Subject: tracing: Cleanup upper_empty() in pid_list Instead of find_first_bit() use the dedicated bitmap_empty(), and make upper_empty() a nice one-liner. While there, fix opencoded BITS_PER_TYPE(). Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250429195119.620204-1-yury.norov@gmail.com Signed-off-by: Yury Norov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/pid_list.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index c62b9b3cfb3d..090bb5ea4a19 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -81,13 +81,9 @@ static inline bool upper_empty(union upper_chunk *chunk) { /* * If chunk->data has no lower chunks, it will be the same - * as a zeroed bitmask. Use find_first_bit() to test it - * and if it doesn't find any bits set, then the array - * is empty. + * as a zeroed bitmask. */ - int bit = find_first_bit((unsigned long *)chunk->data, - sizeof(chunk->data) * 8); - return bit >= sizeof(chunk->data) * 8; + return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data)); } static inline int pid_split(unsigned int pid, unsigned int *upper1, -- cgit v1.2.3 From ac01fa73f5309a35eff83be61442a8891159b487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 10 May 2025 16:37:30 -0400 Subject: tracepoint: Have tracepoints created with DECLARE_TRACE() have _tp suffix Most tracepoints in the kernel are created with TRACE_EVENT(). The TRACE_EVENT() macro (and DECLARE_EVENT_CLASS() and DEFINE_EVENT() where in reality, TRACE_EVENT() is just a helper macro that calls those other two macros), will create not only a tracepoint (the function trace_() used in the kernel), it also exposes the tracepoint to user space along with defining what fields will be saved by that tracepoint. There are a few places that tracepoints are created in the kernel that are not exposed to userspace via tracefs. They can only be accessed from code within the kernel. 
These tracepoints are created with DEFINE_TRACE() Most of these tracepoints end with "_tp". This is useful as when the developer sees that, they know that the tracepoint is for in-kernel only (meaning it can only be accessed inside the kernel, either directly by the kernel or indirectly via modules and BPF programs) and is not exposed to user space. Instead of making this only a process to add "_tp", enforce it by making the DECLARE_TRACE() append the "_tp" suffix to the tracepoint. This requires adding DECLARE_TRACE_EVENT() macros for the TRACE_EVENT() macro to use that keeps the original name. Link: https://lore.kernel.org/all/20250418083351.20a60e64@gandalf.local.home/ Cc: netdev Cc: Jiri Olsa Cc: Peter Zijlstra Cc: David Ahern Cc: Juri Lelli Cc: Breno Leitao Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Gabriele Monaco Cc: Masami Hiramatsu Link: https://lore.kernel.org/20250510163730.092fad5b@gandalf.local.home Acked-by: Mathieu Desnoyers Acked-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/tracepoints.rst | 17 ++++++---- include/linux/tracepoint.h | 38 +++++++++++++++------- include/trace/bpf_probe.h | 8 ++--- include/trace/define_trace.h | 17 +++++++++- include/trace/events/sched.h | 30 ++++++++--------- include/trace/events/tcp.h | 2 +- tools/testing/selftests/bpf/progs/raw_tp_null.c | 2 +- .../testing/selftests/bpf/progs/raw_tp_null_fail.c | 2 +- .../selftests/bpf/progs/test_module_attach.c | 4 +-- .../selftests/bpf/progs/test_tp_btf_nullable.c | 4 +-- .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 8 ++--- 11 files changed, 83 insertions(+), 49 deletions(-) diff --git a/Documentation/trace/tracepoints.rst b/Documentation/trace/tracepoints.rst index decabcc77b56..b35c40e3abbe 100644 --- a/Documentation/trace/tracepoints.rst +++ b/Documentation/trace/tracepoints.rst @@ -71,7 +71,7 @@ In subsys/file.c (where the tracing statement must be added):: void somefct(void) { ... - trace_subsys_eventname(arg, task); + trace_subsys_eventname_tp(arg, task); ... } @@ -129,12 +129,12 @@ within an if statement with the following:: for (i = 0; i < count; i++) tot += calculate_nuggets(); - trace_foo_bar(tot); + trace_foo_bar_tp(tot); } -All trace_() calls have a matching trace__enabled() +All trace__tp() calls have a matching trace__enabled() function defined that returns true if the tracepoint is enabled and -false otherwise. The trace_() should always be within the +false otherwise. The trace__tp() should always be within the block of the if (trace__enabled()) to prevent races between the tracepoint being enabled and the check being seen. @@ -143,7 +143,10 @@ the static_key of the tracepoint to allow the if statement to be implemented with jump labels and avoid conditional branches. .. note:: The convenience macro TRACE_EVENT provides an alternative way to - define tracepoints. Check http://lwn.net/Articles/379903, + define tracepoints. Note, DECLARE_TRACE(foo) creates a function + "trace_foo_tp()" whereas TRACE_EVENT(foo) creates a function + "trace_foo()", and also exposes the tracepoint as a trace event in + /sys/kernel/tracing/events directory. Check http://lwn.net/Articles/379903, http://lwn.net/Articles/381064 and http://lwn.net/Articles/383362 for a series of articles with more details. 
@@ -159,7 +162,9 @@ In a C file:: void do_trace_foo_bar_wrapper(args) { - trace_foo_bar(args); + trace_foo_bar_tp(args); // for tracepoints created via DECLARE_TRACE + // or + trace_foo_bar(args); // for tracepoints created via TRACE_EVENT } In the header file:: diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index a351763e6965..826ce3f8e1f8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -464,16 +464,30 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name##_tp, PARAMS(proto), PARAMS(args), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT(name, proto, args) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_CONDITION(name, proto, args, cond) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_SYSCALL(name, proto, args) \ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ PARAMS(void *__data, proto)) @@ -591,32 +605,32 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) #define DEFINE_EVENT(template, name, proto, args) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_CONDITION(template, name, proto, \ args, cond) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN(name, proto, args, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_CONDITION(name, proto, args, cond, \ struct, assign, print) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ print, reg, unreg) \ - 
DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 183fa2aa2935..9391d54d3f12 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -119,14 +119,14 @@ static inline void bpf_test_buffer_##call(void) \ #undef DECLARE_TRACE #define DECLARE_TRACE(call, proto, args) \ - __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ - __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0) + __BPF_DECLARE_TRACE(call##_tp, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call##_tp, call##_tp, PARAMS(proto), PARAMS(args), 0) #undef DECLARE_TRACE_WRITABLE #define DECLARE_TRACE_WRITABLE(call, proto, args, size) \ __CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \ - __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ - __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size) + __BPF_DECLARE_TRACE(call##_tp, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call##_tp, call##_tp, PARAMS(proto), PARAMS(args), size) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index ed52d0506c69..b2ba5a80583f 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -74,10 +74,18 @@ #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ - DEFINE_TRACE(name, PARAMS(proto), PARAMS(args)) + DEFINE_TRACE(name##_tp, PARAMS(proto), PARAMS(args)) #undef DECLARE_TRACE_CONDITION #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ + DEFINE_TRACE(name##_tp, PARAMS(proto), PARAMS(args)) + +#undef DECLARE_TRACE_EVENT +#define DECLARE_TRACE_EVENT(name, proto, args) \ + DEFINE_TRACE(name, PARAMS(proto), PARAMS(args)) + +#undef DECLARE_TRACE_EVENT_CONDITION +#define DECLARE_TRACE_EVENT_CONDITION(name, proto, args, cond) \ DEFINE_TRACE(name, PARAMS(proto), PARAMS(args)) /* If requested, create helpers for calling these tracepoints from Rust. */ @@ -115,6 +123,11 @@ #undef DECLARE_TRACE_CONDITION #define DECLARE_TRACE_CONDITION(name, proto, args, cond) +#undef DECLARE_TRACE_EVENT +#define DECLARE_TRACE_EVENT(name, proto, args) +#undef DECLARE_TRACE_EVENT_CONDITION +#define DECLARE_TRACE_EVENT_CONDITION(name, proto, args, cond) + #ifdef TRACEPOINTS_ENABLED #include #include @@ -136,6 +149,8 @@ #undef TRACE_HEADER_MULTI_READ #undef DECLARE_TRACE #undef DECLARE_TRACE_CONDITION +#undef DECLARE_TRACE_EVENT +#undef DECLARE_TRACE_EVENT_CONDITION /* Only undef what we defined in this file */ #ifdef UNDEF_TRACE_INCLUDE_FILE diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8994e97d86c1..152fc8b37aa5 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -773,64 +773,64 @@ TRACE_EVENT(sched_wake_idle_without_ipi, * * Postfixed with _tp to make them easily identifiable in the code. 
*/ -DECLARE_TRACE(pelt_cfs_tp, +DECLARE_TRACE(pelt_cfs, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); -DECLARE_TRACE(pelt_rt_tp, +DECLARE_TRACE(pelt_rt, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_dl_tp, +DECLARE_TRACE(pelt_dl, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_hw_tp, +DECLARE_TRACE(pelt_hw, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_irq_tp, +DECLARE_TRACE(pelt_irq, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_se_tp, +DECLARE_TRACE(pelt_se, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); -DECLARE_TRACE(sched_cpu_capacity_tp, +DECLARE_TRACE(sched_cpu_capacity, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(sched_overutilized_tp, +DECLARE_TRACE(sched_overutilized, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); -DECLARE_TRACE(sched_util_est_cfs_tp, +DECLARE_TRACE(sched_util_est_cfs, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); -DECLARE_TRACE(sched_util_est_se_tp, +DECLARE_TRACE(sched_util_est_se, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); -DECLARE_TRACE(sched_update_nr_running_tp, +DECLARE_TRACE(sched_update_nr_running, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); -DECLARE_TRACE(sched_compute_energy_tp, +DECLARE_TRACE(sched_compute_energy, TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, unsigned long max_util, unsigned long busy_time), TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); -DECLARE_TRACE(sched_entry_tp, +DECLARE_TRACE(sched_entry, TP_PROTO(bool preempt, unsigned long ip), TP_ARGS(preempt, ip)); -DECLARE_TRACE(sched_exit_tp, +DECLARE_TRACE(sched_exit, TP_PROTO(bool is_switch, unsigned long ip), TP_ARGS(is_switch, ip)); -DECLARE_TRACE_CONDITION(sched_set_state_tp, +DECLARE_TRACE_CONDITION(sched_set_state, TP_PROTO(struct task_struct *tsk, int state), TP_ARGS(tsk, state), TP_CONDITION(!!(tsk->__state) != !!state)); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1a40c41ff8c3..4f9fa1b5b89b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -259,7 +259,7 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); -DECLARE_TRACE(tcp_cwnd_reduction_tp, +DECLARE_TRACE(tcp_cwnd_reduction, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), TP_ARGS(sk, newly_acked_sacked, newly_lost, flag) diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c index 5927054b6dd9..efa416f53968 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL"; int tid; int i; -SEC("tp_btf/bpf_testmod_test_raw_tp_null") +SEC("tp_btf/bpf_testmod_test_raw_tp_null_tp") int BPF_PROG(test_raw_tp_null, struct sk_buff *skb) { struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c index 38d669957bf1..0d58114a4955 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c @@ -8,7 +8,7 @@ char _license[] SEC("license") = "GPL"; /* Ensure module parameter has PTR_MAYBE_NULL */ -SEC("tp_btf/bpf_testmod_test_raw_tp_null") +SEC("tp_btf/bpf_testmod_test_raw_tp_null_tp") __failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int test_raw_tp_null_bpf_testmod_test_raw_tp_null_arg_1(void *ctx) { asm volatile("r1 = 
*(u64 *)(r1 +0); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 7f3c233943b3..03d7f89787a1 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -19,7 +19,7 @@ int BPF_PROG(handle_raw_tp, __u32 raw_tp_bare_write_sz = 0; -SEC("raw_tp/bpf_testmod_test_write_bare") +SEC("raw_tp/bpf_testmod_test_write_bare_tp") int BPF_PROG(handle_raw_tp_bare, struct task_struct *task, struct bpf_testmod_test_write_ctx *write_ctx) { @@ -31,7 +31,7 @@ int raw_tp_writable_bare_in_val = 0; int raw_tp_writable_bare_early_ret = 0; int raw_tp_writable_bare_out_val = 0; -SEC("raw_tp.w/bpf_testmod_test_writable_bare") +SEC("raw_tp.w/bpf_testmod_test_writable_bare_tp") int BPF_PROG(handle_raw_tp_writable_bare, struct bpf_testmod_test_writable_ctx *writable) { diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 39ff06f2c834..cf0547a613ff 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -6,14 +6,14 @@ #include "../test_kmods/bpf_testmod.h" #include "bpf_misc.h" -SEC("tp_btf/bpf_testmod_test_nullable_bare") +SEC("tp_btf/bpf_testmod_test_nullable_bare_tp") __failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) { return nullable_ctx->len; } -SEC("tp_btf/bpf_testmod_test_nullable_bare") +SEC("tp_btf/bpf_testmod_test_nullable_bare_tp") int BPF_PROG(handle_tp_btf_nullable_bare2, struct bpf_testmod_test_read_ctx *nullable_ctx) { if (nullable_ctx) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 3220f1d28697..18eded4d1d15 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -413,7 +413,7 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, (void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2); - (void)trace_bpf_testmod_test_raw_tp_null(NULL); + (void)trace_bpf_testmod_test_raw_tp_null_tp(NULL); bpf_testmod_test_struct_ops3(); @@ -431,14 +431,14 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, if (bpf_testmod_loop_test(101) > 100) trace_bpf_testmod_test_read(current, &ctx); - trace_bpf_testmod_test_nullable_bare(NULL); + trace_bpf_testmod_test_nullable_bare_tp(NULL); /* Magic number to enable writable tp */ if (len == 64) { struct bpf_testmod_test_writable_ctx writable = { .val = 1024, }; - trace_bpf_testmod_test_writable_bare(&writable); + trace_bpf_testmod_test_writable_bare_tp(&writable); if (writable.early_ret) return snprintf(buf, len, "%d\n", writable.val); } @@ -470,7 +470,7 @@ bpf_testmod_test_write(struct file *file, struct kobject *kobj, .len = len, }; - trace_bpf_testmod_test_write_bare(current, &ctx); + trace_bpf_testmod_test_write_bare_tp(current, &ctx); return -EIO; /* always fail */ } -- cgit v1.2.3 From 155fd6c3e2f02efdc71a9b62888942efc217aff0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 May 2025 13:34:58 -0400 Subject: tracing/sched: Use __string() instead of fixed lengths for task->comm The sched_switch and sched_waking events hardcoded the length of the comm it recorded because these events were created before the dynamic strings were implemented. 
Unfortunately, several other events copied this method. As the size of the comm may change in the future, make the string dynamic. The dynamic string requires 4 bytes of meta data to hold the size and offset of the string. The amount stored in the ring buffer will then be strlen(comm) + 5 (the 4 bytes of meta data plus the terminating '\0'), aligned to 4 bytes if there are no other strings. This means that a task comm can have up to 10 characters before it requires another 4 bytes in the ring buffer. Most task comms are usually shorter than that, so this should not be a problem, and it also allows the name to be extended beyond TASK_COMM_LEN [1]. Note, the sched_switch and sched_waking trace events still hardcode the length, as there is tooling that still requires that. An effort to update the tooling will be made to allow this to change in the future. [1] https://lore.kernel.org/all/20250507110444.963779-1-bhupesh@igalia.com/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Bhupesh Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Juri Lelli Link: https://lore.kernel.org/20250507133458.51bafd95@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/trace/events/sched.h | 94 ++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 152fc8b37aa5..fadc7592372b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -20,16 +20,16 @@ TRACE_EVENT(sched_kthread_stop, TP_ARGS(t), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) + __string( comm, t->comm ) + __field( pid_t, pid ) ), TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = t->pid; ), - TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) + TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid) ); /* @@ -276,15 +276,15 @@ TRACE_EVENT(sched_migrate_task, TP_ARGS(p, dest_cpu), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) + __string( comm, p->comm ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) ), TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); @@ -292,7 +292,7 @@ TRACE_EVENT(sched_migrate_task, ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", - __entry->comm, __entry->pid, __entry->prio, + __get_str(comm), __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); @@ -303,19 +303,19 @@ DECLARE_EVENT_CLASS(sched_process_template, TP_ARGS(p), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) + __string( comm, p->comm ) + __field( pid_t, pid ) + __field( int, prio ) ), TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", - __entry->comm, __entry->pid, __entry->prio) + __get_str(comm), __entry->pid, __entry->prio) ); /* @@ -349,19 +349,19 @@ TRACE_EVENT(sched_process_wait, TP_ARGS(pid), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) + __string( comm, current->comm ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( - memcpy(__entry->comm, 
current->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", - __entry->comm, __entry->pid, __entry->prio) + __get_str(comm), __entry->pid, __entry->prio) ); /* @@ -374,22 +374,22 @@ TRACE_EVENT(sched_process_fork, TP_ARGS(parent, child), TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) + __string( parent_comm, parent->comm ) + __field( pid_t, parent_pid ) + __string( child_comm, child->comm ) + __field( pid_t, child_pid ) ), TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __assign_str(parent_comm); __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __assign_str(child_comm); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) + __get_str(parent_comm), __entry->parent_pid, + __get_str(child_comm), __entry->child_pid) ); /* @@ -473,19 +473,19 @@ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( u64, delay ) + __string( comm, tsk->comm ) + __field( pid_t, pid ) + __field( u64, delay ) ), TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", - __entry->comm, __entry->pid, + __get_str(comm), __entry->pid, (unsigned long long)__entry->delay) ); @@ -531,19 +531,19 @@ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( u64, runtime ) + __string( comm, tsk->comm ) + __field( pid_t, pid ) + __field( u64, runtime ) ), TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = tsk->pid; __entry->runtime = runtime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns]", - __entry->comm, __entry->pid, + __get_str(comm), __entry->pid, (unsigned long long)__entry->runtime) ); @@ -562,14 +562,14 @@ TRACE_EVENT(sched_pi_setprio, TP_ARGS(tsk, pi_task), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, oldprio ) - __field( int, newprio ) + __string( comm, tsk->comm ) + __field( pid_t, pid ) + __field( int, oldprio ) + __field( int, newprio ) ), TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? 
@@ -579,7 +579,7 @@ TRACE_EVENT(sched_pi_setprio, ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", - __entry->comm, __entry->pid, + __get_str(comm), __entry->pid, __entry->oldprio, __entry->newprio) ); @@ -589,16 +589,16 @@ TRACE_EVENT(sched_process_hang, TP_ARGS(tsk), TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) + __string( comm, tsk->comm ) + __field( pid_t, pid ) ), TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __assign_str(comm); __entry->pid = tsk->pid; ), - TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) + TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ -- cgit v1.2.3 From 2632a2013f58f0aab4b9fd042e67d78740ba0996 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 14 May 2025 16:04:18 +0900 Subject: tracing: Record trace_clock and recover when reboot Record trace_clock information in the trace_scratch area and recover the trace_clock at boot, so that readers can decode the timestamps correctly. Note that since most trace_clocks record the timestamp in nanoseconds, this is not a bug. But some trace_clocks, like counter and tsc, record the counter value. Only for those trace_clocks does the reader need this information. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174720625803.1925039.1815089037443798944.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cf51c30b137f..2c1764ed87b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6066,6 +6066,7 @@ struct trace_mod_entry { }; struct trace_scratch { + unsigned int clock_id; unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; @@ -6181,6 +6182,7 @@ static void update_last_data(struct trace_array *tr) if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; + tscratch->clock_id = tr->clock_id; memset(tscratch->entries, 0, flex_array_size(tscratch, entries, tscratch->nr_entries)); tscratch->nr_entries = 0; @@ -7403,6 +7405,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->max_buffer); #endif + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } + mutex_unlock(&trace_types_lock); return 0; @@ -9628,6 +9636,15 @@ static void setup_trace_scratch(struct trace_array *tr, /* Scan modules to make text delta for modules. */ module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. */ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } + } return; reset: /* Invalid trace modules */ -- cgit v1.2.3 From 2fbdb6d8e03b70668c0876e635506540ae92ab05 Mon Sep 17 00:00:00 2001 From: Pan Taixi Date: Mon, 26 May 2025 09:37:31 +0800 Subject: tracing: Fix compilation warning on arm32 On arm32, size_t is defined as unsigned int, while PAGE_SIZE is unsigned long. This triggers a compilation warning, as min() asserts that the types of its two operands are equal. Casting PAGE_SIZE to size_t solves the issue and works on other target architectures as well. A minimal sketch of the mismatch follows below, before the warning details. 
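To make the mismatch concrete, here is a standalone user-space sketch. It is not the kernel's min() from include/linux/minmax.h; MY_MIN and FAKE_PAGE_SIZE are illustrative stand-ins, and only the GCC/Clang typeof and statement-expression extensions are assumed.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's min(): the pointer comparison plays
 * the role of __typecheck() and makes the compiler emit "comparison of
 * distinct pointer types lacks a cast" when x and y have different types. */
#define MY_MIN(x, y) ({				\
	typeof(x) _x = (x);			\
	typeof(y) _y = (y);			\
	(void)(&_x == &_y);	/* warns if the operand types differ */ \
	_x < _y ? _x : _y; })

#define FAKE_PAGE_SIZE 4096UL	/* unsigned long, like PAGE_SIZE */

int main(void)
{
	size_t used = 123;	/* size_t is unsigned int on arm32 */

	/* size_t n = MY_MIN(used, FAKE_PAGE_SIZE);  -- warns on a 32-bit build */
	size_t n = MY_MIN(used, (size_t)FAKE_PAGE_SIZE);	/* cast makes the types match */

	printf("%zu\n", n);
	return 0;
}

On a 64-bit build both operands are already unsigned long, so the uncast call compiles cleanly; the warning only appears on 32-bit targets such as arm32, which is why the explicit size_t cast is needed.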
Compilation warning details: kernel/trace/trace.c: In function 'tracing_splice_read_pipe': ./include/linux/minmax.h:20:28: warning: comparison of distinct pointer types lacks a cast (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ ./include/linux/minmax.h:26:4: note: in expansion of macro '__typecheck' (__typecheck(x, y) && __no_side_effects(x, y)) ^~~~~~~~~~~ ... kernel/trace/trace.c:6771:8: note: in expansion of macro 'min' min((size_t)trace_seq_used(&iter->seq), ^~~ Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250526013731.1198030-1-pantaixi@huaweicloud.com Fixes: f5178c41bb43 ("tracing: Fix oob write in trace_seq_to_buffer()") Reviewed-by: Jeongjun Park Signed-off-by: Pan Taixi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c1764ed87b0..b60d495c2a79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6891,7 +6891,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), min((size_t)trace_seq_used(&iter->seq), - PAGE_SIZE)); + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; -- cgit v1.2.3