summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/bpf_trace.c12
-rw-r--r--kernel/trace/ring_buffer.c58
-rw-r--r--kernel/trace/trace.c6
-rw-r--r--kernel/trace/trace.h26
-rw-r--r--kernel/trace/trace_events_synth.c17
-rw-r--r--kernel/trace/trace_selftest.c9
6 files changed, 97 insertions, 31 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a6903912f7a0..818cf8ad6f76 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size,
{
int ret;
+ /*
+ * NB: We rely on strncpy_from_user() not copying junk past the NUL
+ * terminator into `dst`.
+ *
+ * strncpy_from_user() does long-sized strides in the fast path. If the
+ * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
+ * then there could be junk after the NUL in `dst`. If user takes `dst`
+ * and keys a hash map with it, then semantically identical strings can
+ * occupy multiple entries in the map.
+ */
ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
@@ -1198,7 +1208,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
*btf = bpf_get_btf_vmlinux();
if (IS_ERR_OR_NULL(*btf))
- return PTR_ERR(*btf);
+ return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
if (ptr->type_id > 0)
*btf_id = ptr->type_id;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f45fd9d5a45..dc83b3fa9fe7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -438,14 +438,16 @@ enum {
};
/*
* Used for which event context the event is in.
- * NMI = 0
- * IRQ = 1
- * SOFTIRQ = 2
- * NORMAL = 3
+ * TRANSITION = 0
+ * NMI = 1
+ * IRQ = 2
+ * SOFTIRQ = 3
+ * NORMAL = 4
*
* See trace_recursive_lock() comment below for more details.
*/
enum {
+ RB_CTX_TRANSITION,
RB_CTX_NMI,
RB_CTX_IRQ,
RB_CTX_SOFTIRQ,
@@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* a bit of overhead in something as critical as function tracing,
* we use a bitmask trick.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * bit 1 = NMI context
+ * bit 2 = IRQ context
+ * bit 3 = SoftIRQ context
+ * bit 4 = normal context.
*
* This works because this is the order of contexts that can
* preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The least significant bit can be cleared this way, and it
* just so happens that it is the same bit corresponding to
* the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
*/
static __always_inline int
@@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
bit = pc & NMI_MASK ? RB_CTX_NMI :
pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
- if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
- return 1;
+ if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+ /*
+ * It is possible that this was called by transitioning
+ * between interrupt context, and preempt_count() has not
+ * been updated yet. In this case, use the TRANSITION bit.
+ */
+ bit = RB_CTX_TRANSITION;
+ if (val & (1 << (bit + cpu_buffer->nest)))
+ return 1;
+ }
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
@@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->current_context - (1 << cpu_buffer->nest);
}
-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5
/**
* ring_buffer_nest_start - Allow to trace while nested
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 528971714fc6..410cfeb16db5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
- * to store the trace event for the tigger to use. It's recusive
+ * to store the trace event for the trigger to use. It's recursive
* safe and will not be recorded anywhere.
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
@@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
/* This should never happen. If it does, yell once and skip */
- if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
goto out;
/*
@@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)
/* Interrupts must see nesting incremented before we use the buffer */
barrier();
- return &buffer->buffer[buffer->nesting][0];
+ return &buffer->buffer[buffer->nesting - 1][0];
}
static void put_trace_buf(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f3f5e77123ad..1dadef445cd1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -637,6 +637,12 @@ enum {
* function is called to clear it.
*/
TRACE_GRAPH_NOTRACE_BIT,
+
+ /*
+ * When transitioning between context, the preempt_count() may
+ * not be correct. Allow for a single recursion to cover this case.
+ */
+ TRACE_TRANSITION_BIT,
};
#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
@@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
return 0;
bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit)))
- return -1;
+ if (unlikely(val & (1 << bit))) {
+ /*
+ * It could be that preempt_count has not been updated during
+ * a switch between contexts. Allow for a single recursion.
+ */
+ bit = TRACE_TRANSITION_BIT;
+ if (trace_recursion_test(bit))
+ return -1;
+ trace_recursion_set(bit);
+ barrier();
+ return bit + 1;
+ }
+
+ /* Normal check passed, clear the transition to allow it again */
+ trace_recursion_clear(TRACE_TRANSITION_BIT);
val |= 1 << bit;
current->trace_recursion = val;
barrier();
- return bit;
+ return bit + 1;
}
static __always_inline void trace_clear_recursion(int bit)
@@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit)
if (!bit)
return;
+ bit--;
bit = 1 << bit;
val &= ~bit;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 84b7cab55291..881df991742a 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
{
struct synth_field *field;
const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
- int len, ret = 0;
+ int len, ret = -ENOMEM;
struct seq_buf s;
ssize_t size;
@@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
len--;
field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
+ if (!field->name)
goto free;
- }
+
if (!is_good_name(field->name)) {
synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));
ret = -EINVAL;
@@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
len += strlen(prefix);
field->type = kzalloc(len, GFP_KERNEL);
- if (!field->type) {
- ret = -ENOMEM;
+ if (!field->type)
goto free;
- }
+
seq_buf_init(&s, field->type, len);
if (prefix)
seq_buf_puts(&s, prefix);
@@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
}
if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
goto free;
+
s.buffer[s.len] = '\0';
size = synth_field_size(field->type);
@@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
len = sizeof("__data_loc ") + strlen(field->type) + 1;
type = kzalloc(len, GFP_KERNEL);
- if (!type) {
- ret = -ENOMEM;
+ if (!type)
goto free;
- }
seq_buf_init(&s, type, len);
seq_buf_puts(&s, "__data_loc ");
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b5e3496cf803..4738ad48a667 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_rec_probe);
ret = -1;
- if (trace_selftest_recursion_cnt != 1) {
- pr_cont("*callback not called once (%d)* ",
+ /*
+ * Recursion allows for transitions between context,
+ * and may call the callback twice.
+ */
+ if (trace_selftest_recursion_cnt != 1 &&
+ trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called once (or twice) (%d)* ",
trace_selftest_recursion_cnt);
goto out;
}