From 6496bb72bf20c1c7e4d6be44dfa663163e709116 Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Fri, 13 Jan 2017 08:58:34 -0800 Subject: uprobe: Find last occurrence of ':' when parsing uprobe PATH:OFFSET Previously, `create_trace_uprobe` found the *first* occurrence of the ':' character when parsing `PATH:OFFSET` for a uprobe. However, if the path contains a ':' character, then the function would parse the path incorrectly. Even worse, if the path does not exist, the subsequent call to `kern_path()` would set `ret` to `ENOENT`, leading to very cryptic errno values in user space. The fix is to find the *last* occurrence of ':'. How to repro: the write fails with "No such file or directory", suggesting incorrectly that the `uprobe_events` file does not exist. $ mkdir testing && cd testing $ cp /bin/bash . $ cp /bin/bash ./bash:with:colon $ echo "p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x6" > /sys/kernel/debug/tracing/uprobe_events # this works $ echo "p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x6" >> /sys/kernel/debug/tracing/uprobe_events # this doesn't -bash: echo: write error: No such file or directory With the patch: $ echo "p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x6" > /sys/kernel/debug/tracing/uprobe_events # this still works $ echo "p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x6" >> /sys/kernel/debug/tracing/uprobe_events # this works now too! $ cat /sys/kernel/debug/tracing/uprobe_events p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x0000000000000006 p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x0000000000000006 Link: http://lkml.kernel.org/r/20170113165834.4081016-1-kennyyu@fb.com Signed-off-by: Kenny Yu Reviewed-by: Omar Sandoval Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0913693caf6e..4f2ba2bb11e0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -431,7 +431,8 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - arg = strchr(argv[1], ':'); + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(argv[1], ':'); if (!arg) { ret = -EINVAL; goto fail_address_parse; -- cgit v1.2.3-58-ga151 From d45ae1f7041ac52ade6c5ec76d96bbed765d67aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 17 Jan 2017 12:29:35 -0500 Subject: tracing: Process constants for (un)likely() profiler When running the likely/unlikely profiler, one of the results did not look accurate. It noted that the unlikely() in link_path_walk() was 100% incorrect. When I added a trace_printk() to see what was happening there, it became 80% correct! Looking deeper into what was happening, I found that gcc had split that if statement into two paths: one where the condition became a constant, and another where it remained a variable. The variable path always hit the if statement (making the unlikely() there always false), but since unlikely() is defined as: #define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) where constants are ignored by the branch profiler, the "constant" path made by the compiler was ignored, even though it was hit 80% of the time. 
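To see concretely why the constant path escapes the old definition, here is a minimal userspace sketch (profiled_unlikely() and profile_branch() are hypothetical stand-ins for the macro and ftrace_likely_update(); the real profiler records into the _ftrace_annotated_branch section rather than printing):

    #include <stdio.h>

    static int profile_branch(int val)
    {
            /* stand-in for ftrace_likely_update() */
            printf("profiled: %d\n", val);
            return val;
    }

    /* Old-style check: __builtin_constant_p() short-circuits the profiler. */
    #define profiled_unlikely(x) \
            (__builtin_constant_p(x) ? !!(x) : profile_branch(!!(x)))

    int main(int argc, char **argv)
    {
            (void)argv;
            if (profiled_unlikely(argc > 1))   /* variable: gets profiled */
                    puts("have args");
            if (profiled_unlikely(0))          /* constant: never reaches the profiler */
                    puts("unreachable");
            return 0;
    }

When gcc clones a branch so that one copy sees a constant, that copy disappears from the statistics in exactly this way.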
By passing the constant value to the __branch_check__() function and tracing it out of line (recorded as always correct, since likely/unlikely is not a factor for constants), we get back accurate readings for branches that gcc optimized into partially constant execution paths. Signed-off-by: Steven Rostedt (VMware) --- include/linux/compiler.h | 14 ++++++++------ kernel/trace/trace_branch.c | 6 +++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index cf0fa5d86059..bbbe1570de1c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -107,12 +107,13 @@ struct ftrace_branch_data { */ #if defined(CONFIG_TRACE_BRANCH_PROFILING) \ && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); +void ftrace_likely_update(struct ftrace_branch_data *f, int val, + int expect, int is_constant); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -#define __branch_check__(x, expect) ({ \ +#define __branch_check__(x, expect, is_constant) ({ \ int ______r; \ static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_annotated_branch"))) \ ______f = { \ @@ -122,8 +123,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .file = __FILE__, \ .line = __LINE__, \ }; \ - ______r = likely_notrace(x); \ - ftrace_likely_update(&______f, ______r, expect); \ + ______r = __builtin_expect(!!(x), expect); \ + ftrace_likely_update(&______f, ______r, \ + expect, is_constant); \ ______r; \ }) @@ -133,10 +135,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * written by Daniel Walker. */ # ifndef likely -# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) +# define likely(x) (__branch_check__(x, 1, __builtin_constant_p(x))) # endif # ifndef unlikely -# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) +# define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x))) # endif #ifdef CONFIG_PROFILE_ALL_BRANCHES diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 75489de546b6..7afe426ea528 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -200,8 +200,12 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, + int expect, int is_constant) { + /* A constant is always correct */ + if (is_constant) + val = expect; /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely -- cgit v1.2.3-58-ga151 From 134e6a034cb004ed5acd3048792de70ced1c6cf5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 08:57:14 -0500 Subject: tracing: Show number of constants profiled in likely profiler Now that constants are traced, it is useful to see the number of constants that are traced in the likely/unlikely profiler in order to know if they should be ignored or not. The likely/unlikely profiler will display a number after the "correct" count if a "constant" count exists. 
Signed-off-by: Steven Rostedt (VMware) --- include/linux/compiler.h | 15 ++++++---- kernel/trace/trace_branch.c | 68 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bbbe1570de1c..a73cc9afa784 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -101,13 +101,18 @@ struct ftrace_branch_data { }; }; +struct ftrace_likely_data { + struct ftrace_branch_data data; + unsigned long constant; +}; + /* * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. */ #if defined(CONFIG_TRACE_BRANCH_PROFILING) \ && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) -void ftrace_likely_update(struct ftrace_branch_data *f, int val, +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant); #define likely_notrace(x) __builtin_expect(!!(x), 1) @@ -115,13 +120,13 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, #define __branch_check__(x, expect, is_constant) ({ \ int ______r; \ - static struct ftrace_branch_data \ + static struct ftrace_likely_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_annotated_branch"))) \ ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ + .data.func = __func__, \ + .data.file = __FILE__, \ + .data.line = __LINE__, \ }; \ ______r = __builtin_expect(!!(x), expect); \ ftrace_likely_update(&______f, ______r, \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7afe426ea528..fd483d74f8e1 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -200,25 +200,27 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { /* A constant is always correct */ - if (is_constant) + if (is_constant) { + f->constant++; val = expect; + } /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely * conditions that the recursive nightmare that exists is too * much to try to get working. At least for now. */ - trace_likely_condition(f, val, expect); + trace_likely_condition(&f->data, val, expect); /* FIXME: Make this atomic! */ if (val == expect) - f->correct++; + f->data.correct++; else - f->incorrect++; + f->data.incorrect++; } EXPORT_SYMBOL(ftrace_likely_update); @@ -249,29 +251,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p) return percent; } -static int branch_stat_show(struct seq_file *m, void *v) +static const char *branch_stat_process_file(struct ftrace_branch_data *p) { - struct ftrace_branch_data *p = v; const char *f; - long percent; /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') f--; - f++; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; /* * The miss is overlayed on correct, and hit on incorrect. 
*/ percent = get_incorrect_percent(p); - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); return 0; } @@ -283,7 +316,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace) static void * annotated_branch_stat_next(void *v, int idx) { - struct ftrace_branch_data *p = v; + struct ftrace_likely_data *p = v; ++p; @@ -332,7 +365,7 @@ static struct tracer_stat annotated_branch_stats = { .stat_next = annotated_branch_stat_next, .stat_cmp = annotated_branch_stat_cmp, .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = annotate_branch_stat_show }; __init static int init_annotated_branch_stats(void) @@ -383,12 +416,21 @@ all_branch_stat_next(void *v, int idx) return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; __init static int all_annotated_branch_stats(void) -- cgit v1.2.3-58-ga151 From 068f530b3f274d313395663bf8d674798d4858c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 08:57:41 -0500 Subject: tracing: Add the constant count for branch tracer The unlikely/likely branch profiler now gets called even if the if statement is a constant (always goes in one direction without a compare). Add a value to denote this in the likely/unlikely tracer as well. 
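(An editorial aside on the annotate_branch_stat_show() hunk in the previous patch: it sizes the combined "correct/constant" column with snprintf(NULL, 0, ...), which returns the length the output would need without writing anything. A standalone sketch of that measuring trick, with made-up sample counts:)

    #include <stdio.h>

    int main(void)
    {
            unsigned long correct = 19342, incorrect = 24, constant = 53;
            int l;

            /* measure how many characters "/53" would take */
            l = snprintf(NULL, 0, "/%lu", constant);
            /* shrink the next field so the columns still line up */
            l = l > 8 ? 0 : 8 - l;
            printf("%8lu/%lu %*lu\n", correct, constant, l, incorrect);
            return 0;
    }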
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_branch.c | 17 +++++++++-------- kernel/trace/trace_entries.h | 6 ++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index fd483d74f8e1..4d8fdf3184dc 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex); static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; @@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry = ring_buffer_event_data(event); /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') p--; p++; - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); strncpy(entry->file, p, TRACE_FILE_SIZE); entry->func[TRACE_FUNC_SIZE] = 0; entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; + entry->constant = f->constant; + entry->line = f->data.line; entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { if (!branch_tracing_enabled) return; @@ -195,7 +196,7 @@ core_initcall(init_branch_tracer); #else static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ @@ -214,7 +215,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * conditions that the recursive nightmare that exists is too * much to try to get working. At least for now. */ - trace_likely_condition(&f->data, val, expect); + trace_likely_condition(f, val, expect); /* FIXME: Make this atomic! */ if (val == expect) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index eb7396b7e7c3..c203ac4df791 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch, __array( char, func, TRACE_FUNC_SIZE+1 ) __array( char, file, TRACE_FILE_SIZE+1 ) __field( char, correct ) + __field( char, constant ) ), - F_printk("%u:%s:%s (%u)", + F_printk("%u:%s:%s (%u)%s", __entry->line, - __entry->func, __entry->file, __entry->correct), + __entry->func, __entry->file, __entry->correct, + __entry->constant ? " CONSTANT" : ""), FILTER_OTHER ); -- cgit v1.2.3-58-ga151 From 3e278c0dc1cf5070d9462ececde1c07369b469b2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:45 +0900 Subject: ftrace: Factor out __ftrace_hash_move() __ftrace_hash_move() allocates a properly-sized hash and moves the entries from the src ftrace_hash into it. It will be used to set the function graph filters, which have nothing to do with the dyn_ftrace records. 
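As background, the sizing rule this helper inherits picks enough bits for a table about half the number of entries. A userspace sketch (the 12-bit cap mirrors the kernel's FTRACE_HASH_MAX_BITS and is an assumption; it does not appear in this diff):

    #include <stdio.h>

    #define HASH_MAX_BITS 12  /* assumed cap, as in FTRACE_HASH_MAX_BITS */

    static int hash_bits(int count)
    {
            int bits = 0;
            int size;

            /* make the hash size about 1/2 the number found */
            for (size = count / 2; size; size >>= 1)
                    bits++;
            if (bits > HASH_MAX_BITS)
                    bits = HASH_MAX_BITS;
            return bits;
    }

    int main(void)
    {
            printf("100 entries -> %d bits (%d buckets)\n",
                   hash_bits(100), 1 << hash_bits(100));
            printf("10000 entries -> %d bits (%d buckets)\n",
                   hash_bits(10000), 1 << hash_bits(10000));
            return 0;
    }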
Link: http://lkml.kernel.org/r/20170120024447.26097-1-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb230f06ba41..37b0e948d924 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1383,9 +1383,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tn; @@ -1393,21 +1392,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; - /* Reject setting notrace hash on IPMODIFY ftrace_ops */ - if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) - return -EINVAL; - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. + * If the new source is empty, just return the empty_hash. */ - if (!src->count) { - new_hash = EMPTY_HASH; - goto update; - } + if (!src->count) + return EMPTY_HASH; /* * Make the hash size about 1/2 the # found @@ -1421,7 +1412,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + return NULL; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1432,7 +1423,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } -update: + return new_hash; +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ if (enable) { /* IPMODIFY should be updated only when filter_hash updating */ -- cgit v1.2.3-58-ga151 From 4046bf023b0647d09704a32d9fe8aecbcee3e4c3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:46 +0900 Subject: ftrace: Expose ftrace_hash_empty and ftrace_lookup_ip It will be used when checking graph filter hashes later. 
Link: http://lkml.kernel.org/r/20170120024447.26097-2-namhyung@kernel.org Signed-off-by: Namhyung Kim [ Moved ftrace_hash declaration and functions outside of FUNCTION_GRAPH define ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 +------------- kernel/trace/trace.h | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37b0e948d924..0470e373b9b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1110,13 +1110,6 @@ struct ftrace_func_entry { unsigned long ip; }; -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; -}; - /* * We make these constant because no one should touch them, * but they are used as the default "empty hash", to avoid allocating @@ -1192,12 +1185,7 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) -{ - return !hash || !hash->count; -} - -static struct ftrace_func_entry * +struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ea51ab53edf..9001460291bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -787,7 +802,6 @@ extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); - #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -- cgit v1.2.3-58-ga151 From b9b0c831bed2682c2e3e9f5420fb6985549ef020 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:47 +0900 Subject: ftrace: Convert graph filter to use hash tables Use ftrace_hash instead of a static array of a fixed size. This is useful when a graph filter pattern matches a large number of functions. Hash lookups are now done with preemption disabled, to protect against the hash being changed or freed underneath them. 
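The shape of the new lookup, as a self-contained userspace sketch (names and the toy hash function are illustrative; the kernel version uses hlist buckets, hash_long(), and RCU-safe traversal under preempt_disable_notrace()):

    #include <stdio.h>
    #include <stdlib.h>

    struct func_entry {
            struct func_entry *next;
            unsigned long ip;
    };

    struct func_hash {
            unsigned long size_bits;
            struct func_entry **buckets;
            unsigned long count;
    };

    static unsigned long hash_key(const struct func_hash *h, unsigned long ip)
    {
            if (!h->size_bits)
                    return 0;
            /* toy multiplicative hash standing in for hash_long() */
            return (ip * 0x61C88647UL) >> (8 * sizeof(ip) - h->size_bits);
    }

    static struct func_entry *lookup_ip(const struct func_hash *h,
                                        unsigned long ip)
    {
            struct func_entry *e;

            for (e = h->buckets[hash_key(h, ip)]; e; e = e->next)
                    if (e->ip == ip)
                            return e;
            return NULL;
    }

    static void add_ip(struct func_hash *h, unsigned long ip)
    {
            struct func_entry *e = malloc(sizeof(*e)); /* checks elided */

            e->ip = ip;
            e->next = h->buckets[hash_key(h, ip)];
            h->buckets[hash_key(h, ip)] = e;
            h->count++;
    }

    int main(void)
    {
            struct func_entry *buckets[16] = { NULL };
            struct func_hash h = { 4, buckets, 0 };

            add_ip(&h, 0x8110aa10UL);
            printf("hit: %d\n", lookup_ip(&h, 0x8110aa10UL) != NULL);
            printf("miss: %d\n", lookup_ip(&h, 0x81000000UL) == NULL);
            return 0;
    }

Unlike the fixed 32-slot array, the cost of a miss stays bounded by the bucket depth rather than by the total filter size.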
Link: http://lkml.kernel.org/r/20170120024447.26097-3-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 190 +++++++++++++++++++++++++++++++++----------------- kernel/trace/trace.h | 64 ++++++++--------- 2 files changed, 158 insertions(+), 96 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0470e373b9b4..2d554a02241d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4378,7 +4378,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static unsigned long save_global_trampoline; static unsigned long save_global_flags; @@ -4401,18 +4401,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; - unsigned long *table = ftrace_graph_funcs; - int *count = &ftrace_graph_count; + struct ftrace_hash *hash; - if (!enable) { - table = ftrace_graph_notrace_funcs; - count = &ftrace_graph_notrace_count; - } + if (enable) + hash = ftrace_graph_hash; + else + hash = ftrace_graph_notrace_hash; while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_graph_set_hash(hash, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -4536,15 +4535,20 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -int ftrace_graph_count; -int ftrace_graph_notrace_count; -unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; -unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; struct ftrace_graph_data { - unsigned long *table; - size_t size; - int *count; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; const struct seq_operations *seq_ops; }; @@ -4552,10 +4556,31 @@ static void * __g_next(struct seq_file *m, loff_t *pos) { struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; - if (*pos >= *fgd->count) + if (*pos >= fgd->hash->count) return NULL; - return &fgd->table[*pos]; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head = &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; } static void * @@ -4572,9 +4597,11 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!*fgd->count && !*pos) + if (ftrace_hash_empty(fgd->hash) && !*pos) return (void *)1; + fgd->idx = 0; + fgd->entry = NULL; return __g_next(m, pos); } @@ -4585,22 +4612,22 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, 
void *v) { - unsigned long *ptr = v; + struct ftrace_func_entry *entry = v; - if (!ptr) + if (!entry) return 0; - if (ptr == (unsigned long *)1) { + if (entry == (void *)1) { struct ftrace_graph_data *fgd = m->private; - if (fgd->table == ftrace_graph_funcs) + if (fgd->type == GRAPH_FILTER_FUNCTION) seq_puts(m, "#### all functions enabled ####\n"); else seq_puts(m, "#### no functions disabled ####\n"); return 0; } - seq_printf(m, "%ps\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)entry->ip); return 0; } @@ -4617,24 +4644,37 @@ __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { int ret = 0; + struct ftrace_hash *new_hash = NULL; - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - *fgd->count = 0; - memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } } - mutex_unlock(&graph_lock); if (file->f_mode & FMODE_READ) { - ret = seq_open(file, fgd->seq_ops); + ret = seq_open(file, &ftrace_graph_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; } } else file->private_data = fgd; +out: + fgd->new_hash = new_hash; return ret; } @@ -4642,6 +4682,7 @@ static int ftrace_graph_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4650,18 +4691,25 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_count; + mutex_lock(&graph_lock); + + fgd->hash = ftrace_graph_hash; + fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_notrace_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4670,45 +4718,53 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_notrace_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_notrace_count; + mutex_lock(&graph_lock); + + fgd->hash = ftrace_graph_notrace_hash; + fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_release(struct inode *inode, struct file *file) { + struct ftrace_graph_data *fgd; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; - kfree(m->private); + fgd = m->private; seq_release(inode, file); } else { - kfree(file->private_data); + fgd = file->private_data; } + kfree(fgd->new_hash); + kfree(fgd); + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) { struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page 
*pg; + struct ftrace_func_entry *entry; int fail = 1; int not; - bool exists; - int i; /* decode regex */ func_g.type = filter_parse_regex(buffer, strlen(buffer), &func_g.search, ¬); - if (!not && *idx >= size) - return -EBUSY; func_g.len = strlen(func_g.search); @@ -4725,26 +4781,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) continue; if (ftrace_match_record(rec, &func_g, NULL, 0)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } + entry = ftrace_lookup_ip(hash, rec->ip); if (!not) { fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= size) - goto out; - } + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) < 0) + goto out; } else { - if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; + if (entry) { + free_hash_entry(hash, entry); fail = 0; } } @@ -4766,6 +4814,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; + struct ftrace_hash *old_hash, *new_hash; if (!cnt) return 0; @@ -4781,10 +4830,25 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, mutex_lock(&graph_lock); /* we allow only one expression at a time */ - ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, - parser.buffer); + ret = ftrace_graph_set_hash(fgd->new_hash, + parser.buffer); + + old_hash = fgd->hash; + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) + ret = -ENOMEM; + + if (fgd->type == GRAPH_FILTER_FUNCTION) + rcu_assign_pointer(ftrace_graph_hash, new_hash); + else + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); } if (!ret) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9001460291bb..afbec961eab1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -803,51 +803,49 @@ extern void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc); #ifdef CONFIG_DYNAMIC_FTRACE -/* TODO: make this variable */ -#define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_count; -extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long addr) { - int i; - - if (!ftrace_graph_count) - return 1; - - for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) { - /* - * If no irqs are to be traced, but a set_graph_function - * is set, and called by an interrupt handler, we still - * want to trace it. - */ - if (in_irq()) - trace_recursion_set(TRACE_IRQ_BIT); - else - trace_recursion_clear(TRACE_IRQ_BIT); - return 1; - } + int ret = 0; + + preempt_disable_notrace(); + + if (ftrace_hash_empty(ftrace_graph_hash)) { + ret = 1; + goto out; } - return 0; + if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. 
+ */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { - int i; + int ret = 0; - if (!ftrace_graph_notrace_count) - return 0; + preempt_disable_notrace(); - for (i = 0; i < ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } + if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + ret = 1; - return 0; + preempt_enable_notrace(); + return ret; } #else static inline int ftrace_graph_addr(unsigned long addr) -- cgit v1.2.3-58-ga151 From 2b0cce0e190f8f0b37fe8102ae657f5d9eb0976d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Feb 2017 12:19:33 -0500 Subject: tracing: Add ftrace_hash_key() helper function Replace the couple of use cases that have small logic to produce the ftrace function key id with a helper function. No need for duplicate code. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d554a02241d..89240f62061c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1185,6 +1185,15 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) +{ + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; +} + struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1195,11 +1204,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) if (ftrace_hash_empty(hash)) return NULL; - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { @@ -1215,11 +1220,7 @@ static void __add_hash_entry(struct ftrace_hash *hash, struct hlist_head *hhd; unsigned long key; - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, entry->ip); hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; -- cgit v1.2.3-58-ga151 From 2b2c279c814112e15577757ec593aa78465e2e56 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Feb 2017 15:37:07 -0500 Subject: ftrace: Create a slight optimization on searching the ftrace_hash This is a micro-optimization, but as it has to deal with a fast path of the function tracer, these optimizations can be noticed. ftrace_lookup_ip() returns the entry if the given ip is found in the hash, and NULL if it is not found or the hash itself is NULL. But there are some cases where a NULL or empty hash counts as true, and ftrace_hash_empty() is tested before calling ftrace_lookup_ip() in those cases. As ftrace_lookup_ip() performs that test again, this adds a few extra unneeded instructions on those paths. A new static "always inlined" function is created that does not perform the hash empty test. This must only be used by callers that do the check first anyway, as an empty or NULL hash could cause a crash if a lookup is performed on it. Also add kernel doc for the main ftrace_lookup_ip() function. 
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89240f62061c..1595df0d7d79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1194,16 +1194,14 @@ ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) return 0; } -struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +/* Only use this function if ftrace_hash_empty() has already been tested */ +static __always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - if (ftrace_hash_empty(hash)) - return NULL; - key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; @@ -1214,6 +1212,25 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. + */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + static void __add_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { @@ -1463,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip, * notrace hash is considered not in the notrace hash. */ return (ftrace_hash_empty(hash->filter_hash) || - ftrace_lookup_ip(hash->filter_hash, ip)) && + __ftrace_lookup_ip(hash->filter_hash, ip)) && (ftrace_hash_empty(hash->notrace_hash) || - !ftrace_lookup_ip(hash->notrace_hash, ip)); + !__ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -2877,7 +2894,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ -- cgit v1.2.3-58-ga151 From 555fc7813eeada1738eca42aa9f9ebafd7dc23e6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 10:15:22 -0500 Subject: ftrace: Replace (void *)1 with a meaningful macro name FTRACE_GRAPH_EMPTY When the set_graph_function or set_graph_notrace contains no records, a banner is displayed of either "#### all functions enabled ####" or "#### all functions disabled ####" respectively. To tell the seq operations to do this, (void *)1 is passed as a return value. Instead of using a hardcoded meaningless variable, define it as a macro. 
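The pattern in miniature, as a userspace sketch ((void *)1 works as a sentinel because it can never be a valid record pointer; the macro name below mirrors FTRACE_GRAPH_EMPTY):

    #include <stdio.h>

    #define GRAPH_EMPTY ((void *)1)

    static void show(void *v)
    {
            if (v == GRAPH_EMPTY) {
                    puts("#### all functions enabled ####");
                    return;
            }
            printf("%s\n", (const char *)v);
    }

    int main(void)
    {
            show(GRAPH_EMPTY);   /* empty hash: print the banner */
            show("vfs_read");    /* a real entry */
            return 0;
    }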
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1595df0d7d79..a9cfc8713198 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4561,6 +4561,8 @@ enum graph_filter_type { GRAPH_FILTER_FUNCTION, }; +#define FTRACE_GRAPH_EMPTY ((void *)1) + struct ftrace_graph_data { struct ftrace_hash *hash; struct ftrace_func_entry *entry; @@ -4616,7 +4618,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) /* Nothing, tell g_show to print all functions are enabled */ if (ftrace_hash_empty(fgd->hash) && !*pos) - return (void *)1; + return FTRACE_GRAPH_EMPTY; fgd->idx = 0; fgd->entry = NULL; @@ -4635,7 +4637,7 @@ static int g_show(struct seq_file *m, void *v) if (!entry) return 0; - if (entry == (void *)1) { + if (entry == FTRACE_GRAPH_EMPTY) { struct ftrace_graph_data *fgd = m->private; if (fgd->type == GRAPH_FILTER_FUNCTION) -- cgit v1.2.3-58-ga151 From d4ad9a1ccac31a04a32b5e7547b70428830e0218 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 14:03:39 -0500 Subject: ftrace: Reset fgd->hash in ftrace_graph_write() fgd->hash is saved and then freed, but is never reset to either ftrace_graph_hash nor ftrace_graph_notrace_hash. But if multiple writes are performed, then the freed hash could be accessed again. # cd /sys/kernel/debug/tracing # head -1000 available_filter_functions > /tmp/funcs # cat /tmp/funcs > set_graph_function Causes: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: [...] CPU: 2 PID: 1337 Comm: cat Not tainted 4.10.0-rc2-test-00010-g6b052e9 #32 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 task: ffff880113a12200 task.stack: ffffc90001940000 RIP: 0010:free_ftrace_hash+0x7c/0x160 RSP: 0018:ffffc90001943db0 EFLAGS: 00010246 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: 6b6b6b6b6b6b6b6b RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff8800ce1e1d40 RBP: ffff8800ce1e1d50 R08: 0000000000000000 R09: 0000000000006400 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8800ce1e1d40 R14: 0000000000004000 R15: 0000000000000001 FS: 00007f9408a07740(0000) GS:ffff88011e500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000aee1f0 CR3: 0000000116bb4000 CR4: 00000000001406e0 Call Trace: ? ftrace_graph_write+0x150/0x190 ? __vfs_write+0x1f6/0x210 ? __audit_syscall_entry+0x17f/0x200 ? rw_verify_area+0xdb/0x210 ? _cond_resched+0x2b/0x50 ? __sb_start_write+0xb4/0x130 ? vfs_write+0x1c8/0x330 ? SyS_write+0x62/0xf0 ? do_syscall_64+0xa3/0x1b0 ? 
entry_SYSCALL64_slow_path+0x25/0x25 Code: 01 48 85 db 0f 84 92 00 00 00 b8 01 00 00 00 d3 e0 85 c0 7e 3f 83 e8 01 48 8d 6f 10 45 31 e4 4c 8d 34 c5 08 00 00 00 49 8b 45 08 <4a> 8b 34 20 48 85 f6 74 13 48 8b 1e 48 89 ef e8 20 fa ff ff 48 RIP: free_ftrace_hash+0x7c/0x160 RSP: ffffc90001943db0 ---[ end trace 999b48216bf4b393 ]--- Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9cfc8713198..b7df0dcf8652 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4858,10 +4858,13 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (!new_hash) ret = -ENOMEM; - if (fgd->type == GRAPH_FILTER_FUNCTION) + if (fgd->type == GRAPH_FILTER_FUNCTION) { rcu_assign_pointer(ftrace_graph_hash, new_hash); - else + fgd->hash = ftrace_graph_hash; + } else { rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + fgd->hash = ftrace_graph_notrace_hash; + } mutex_unlock(&graph_lock); -- cgit v1.2.3-58-ga151 From ae98d27afc3bde5a48f440d905317602a5cfb0d2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 16:59:06 -0500 Subject: ftrace: Have set_graph_functions handle write with RDWR Reading set_graph_functions uses seq functions, which set the file->private_data pointer to a seq_file descriptor; on write-only opens, the ftrace_graph_data descriptor itself is stored in file->private_data. But if the file is opened for RDWR, ftrace_graph_write() will incorrectly use the file->private_data descriptor instead of the ((struct seq_file *)file->private_data)->private pointer, and this can crash the kernel. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b7df0dcf8652..0233c8cb45f4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4842,6 +4842,12 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) return -ENOMEM; + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } + read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { -- cgit v1.2.3-58-ga151 From 0e684b6578ee463ecb5c9a1cd0c20069f063d9f0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 17:58:18 -0500 Subject: tracing: Reset parser->buffer to allow multiple "puts" trace_parser_put() simply frees the allocated parser buffer. But it does not reset the pointer that was freed. This means that if trace_parser_put() is called on the same parser more than once, it will corrupt the allocation system. Setting parser->buffer to NULL after free allows it to be called more than once without any ill effect. 
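A userspace sketch of the resulting idempotent-teardown pattern (free(NULL), like kfree(NULL), is defined to be a no-op, which is what makes the reset sufficient):

    #include <stdlib.h>

    struct parser {
            char *buffer;
    };

    static void parser_put(struct parser *p)
    {
            free(p->buffer);
            p->buffer = NULL;  /* a later put now frees NULL: harmless */
    }

    int main(void)
    {
            struct parser p = { malloc(32) };

            parser_put(&p);
            parser_put(&p);    /* safe: no double free */
            return 0;
    }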
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..4589b67168fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* -- cgit v1.2.3-58-ga151 From 649b988b12ddb9aed16047a3d9bb4d7bfdb47221 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 20:16:29 -0500 Subject: ftrace: Do not hold references of ftrace_graph_{notrace_}hash out of graph_lock The hashes ftrace_graph_hash and ftrace_graph_notrace_hash are modified while the graph_lock is held. Holding a pointer to them and passing it along can lead to a use of a stale pointer (fgd->hash). Move assigning the pointer and its use to within the holding of the lock. Note, this is rcu_sched protected data, and other instances of referencing it are done with preemption disabled. But the file manipulation code must be protected by the lock. The fgd->hash pointer is set to NULL when the lock is being released. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0233c8cb45f4..b3a4896ef78a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4616,6 +4616,13 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + /* Nothing, tell g_show to print all functions are enabled */ if (ftrace_hash_empty(fgd->hash) && !*pos) return FTRACE_GRAPH_EMPTY; @@ -4695,6 +4702,14 @@ __ftrace_graph_open(struct inode *inode, struct file *file, out: fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again. 
+ */ + fgd->hash = NULL; + return ret; } @@ -4713,7 +4728,8 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); - fgd->hash = ftrace_graph_hash; + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; @@ -4740,7 +4756,8 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); - fgd->hash = ftrace_graph_notrace_hash; + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; @@ -4859,17 +4876,18 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, ret = ftrace_graph_set_hash(fgd->new_hash, parser.buffer); - old_hash = fgd->hash; new_hash = __ftrace_hash_move(fgd->new_hash); if (!new_hash) ret = -ENOMEM; if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); rcu_assign_pointer(ftrace_graph_hash, new_hash); - fgd->hash = ftrace_graph_hash; } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); - fgd->hash = ftrace_graph_notrace_hash; } mutex_unlock(&graph_lock); -- cgit v1.2.3-58-ga151 From e704eff3ff5138a462443dcd64d071165df18782 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 20:34:37 -0500 Subject: ftrace: Have set_graph_function handle multiple functions in one write Currently, only one function can be written to set_graph_function and set_graph_notrace: only the last function in the list is saved, even though the others will be added and then removed. Change the behavior to be the same as set_ftrace_function, so as to allow multiple functions to be written. If any one fails, none of them will be added. The addition of the functions is done at the end, when the file is closed. 
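A rough userspace sketch of the accumulate-on-write, publish-on-close flow this introduces (the names and the fixed-size staging set are illustrative; the kernel publishes with rcu_assign_pointer() and waits with synchronize_sched(), as the diff below shows):

    #include <stdio.h>
    #include <string.h>

    struct staging {
            char funcs[8][32];
            int n;
    };

    /* Each write() only parses tokens into the staging set. */
    static int stage_write(struct staging *s, const char *buf)
    {
            char tmp[256], *tok;

            snprintf(tmp, sizeof(tmp), "%s", buf);
            for (tok = strtok(tmp, " \n"); tok; tok = strtok(NULL, " \n")) {
                    if (s->n == 8)
                            return -1;  /* any failure discards the set */
                    snprintf(s->funcs[s->n++], 32, "%s", tok);
            }
            return 0;
    }

    /* Only close/release publishes the accumulated set. */
    static void stage_release(const struct staging *s)
    {
            int i;

            for (i = 0; i < s->n; i++)
                    printf("committed: %s\n", s->funcs[i]);
    }

    int main(void)
    {
            struct staging s = { .n = 0 };

            stage_write(&s, "do_IRQ vfs_read");
            stage_write(&s, "schedule");
            stage_release(&s);  /* publish at close, as the patch does */
            return 0;
    }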
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 105 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b3a4896ef78a..0c0609326391 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4564,12 +4564,13 @@ enum graph_filter_type { #define FTRACE_GRAPH_EMPTY ((void *)1) struct ftrace_graph_data { - struct ftrace_hash *hash; - struct ftrace_func_entry *entry; - int idx; /* for hash table iteration */ - enum graph_filter_type type; - struct ftrace_hash *new_hash; - const struct seq_operations *seq_ops; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; }; static void * @@ -4676,6 +4677,9 @@ __ftrace_graph_open(struct inode *inode, struct file *file, if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + if (file->f_flags & O_TRUNC) new_hash = alloc_ftrace_hash(size_bits); else @@ -4701,6 +4705,9 @@ __ftrace_graph_open(struct inode *inode, struct file *file, file->private_data = fgd; out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + fgd->new_hash = new_hash; /* @@ -4773,6 +4780,9 @@ static int ftrace_graph_release(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -4783,10 +4793,50 @@ ftrace_graph_release(struct inode *inode, struct file *file) fgd = file->private_data; } + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + parser->buffer[parser->idx] = 0; + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); + } + + out: kfree(fgd->new_hash); kfree(fgd); - return 0; + return ret; } static int @@ -4848,61 +4898,34 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; - struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; if (!cnt) return 0; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) - return -ENOMEM; - /* Read mode uses seq functions */ if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; fgd = m->private; } - read = trace_get_user(&parser, ubuf, cnt, ppos); + parser = &fgd->parser; - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; + read = 
trace_get_user(parser, ubuf, cnt, ppos); - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { - mutex_lock(&graph_lock); - - /* we allow only one expression at a time */ ret = ftrace_graph_set_hash(fgd->new_hash, - parser.buffer); - - new_hash = __ftrace_hash_move(fgd->new_hash); - if (!new_hash) - ret = -ENOMEM; - - if (fgd->type == GRAPH_FILTER_FUNCTION) { - old_hash = rcu_dereference_protected(ftrace_graph_hash, - lockdep_is_held(&graph_lock)); - rcu_assign_pointer(ftrace_graph_hash, new_hash); - } else { - old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, - lockdep_is_held(&graph_lock)); - rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); - } - - mutex_unlock(&graph_lock); - - /* Wait till all users are no longer using the old hash */ - synchronize_sched(); - - free_ftrace_hash(old_hash); + parser->buffer); + trace_parser_clear(parser); } if (!ret) ret = read; - trace_parser_put(&parser); - return ret; } -- cgit v1.2.3-58-ga151 From 4c7384131c8d343c0bf79abac3b3e78596d85b10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Feb 2017 13:36:37 -0500 Subject: tracing: Have COMM event filter key be treated as a string The GLOB operation "~" should be able to work with the COMM filter key in order to trace programs with a glob. For example: echo 'COMM ~ "systemd*"' > events/syscalls/filter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afbec961eab1..d2d068b36341 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1312,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) -- cgit v1.2.3-58-ga151 From 1f9b3546cf4c273b6d809003244d05cf0460a5e9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Feb 2017 17:53:50 -0500 Subject: tracing: Have traceprobe_probes_write() not access userspace unnecessarily The code in traceprobe_probes_write() reads up to 4096 bytes from userspace for each line. If userspace passes in several lines to execute, the code will do a large read for each line, even though it is highly likely that the first read from userspace received all of the lines at once. I changed the logic to do a single read from userspace, and to read from userspace again only if not all of the first read made it in. I tested this by adding printk()s and writing files that would test -1, ==, and +1 the buffer size, to make sure that there are no overflows and that a single line written with +1 the buffer size fails properly. 
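A simplified userspace model of the reworked loop (the probe definitions are made-up; the kernel version additionally strips '#' comments and re-reads from userspace when only a partial line remains):

    #include <stdio.h>
    #include <string.h>

    static void run_commands(char *kbuf)
    {
            char *buf = kbuf;

            while (*buf) {
                    char *tmp = strchr(buf, '\n');
                    size_t size;

                    if (tmp) {
                            *tmp = '\0';
                            size = tmp - buf + 1;
                    } else {
                            /* partial final line: kernel re-reads here */
                            size = strlen(buf);
                    }
                    printf("command: %s\n", buf);
                    buf += size;
            }
    }

    int main(void)
    {
            char input[] = "p:myprobe do_sys_open\np:probe2 vfs_read";

            run_commands(input);
            return 0;
    }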
Link: http://lkml.kernel.org/r/20170209180458.5c829ab2@gandalf.local.home Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 48 ++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8c0553d9afd3..2a06f1fa7001 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -647,7 +647,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { - char *kbuf, *tmp; + char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -667,27 +667,37 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, goto out; } kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + } + done += size; - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); + /* Remove comments */ + tmp = strchr(buf, '#'); - if (tmp) - *tmp = '\0'; + if (tmp) + *tmp = '\0'; - ret = traceprobe_command(kbuf, createfn); - if (ret) - goto out; + ret = traceprobe_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); } ret = done; -- cgit v1.2.3-58-ga151 From 8a58a34ba479d42b3ef28f8ffd9e245a81a7786f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Feb 2017 16:41:15 +0100 Subject: timers: Make flags output in the timer_start tracepoint useful The timer flags in the timer_start trace event contain lots of useful information, but the meaning is not clear in the trace output. Making tools rely on the bit positions is bad as they might change over time. Decode the flags in the print out. Tools can retrieve the bits and their meaning from the trace format file. 
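A userspace model of the decoding that __print_flags() performs in the hunk below (the flag values are assumptions taken from include/linux/timer.h of this era and should be treated as illustrative, since tools are expected to read them from the trace format file, not hardcode them):

    #include <stdio.h>

    #define TIMER_MIGRATING  0x00040000u
    #define TIMER_DEFERRABLE 0x00080000u
    #define TIMER_PINNED     0x00100000u
    #define TIMER_IRQSAFE    0x00200000u

    static void decode_timer_flags(unsigned int flags)
    {
            static const struct { unsigned int bit; const char *name; } tbl[] = {
                    { TIMER_MIGRATING,  "M" },
                    { TIMER_DEFERRABLE, "D" },
                    { TIMER_PINNED,     "P" },
                    { TIMER_IRQSAFE,    "I" },
            };
            const char *sep = "";
            unsigned int i;

            for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++) {
                    if (flags & tbl[i].bit) {
                            printf("%s%s", sep, tbl[i].name);
                            sep = "|";
                    }
            }
            printf("\n");
    }

    int main(void)
    {
            decode_timer_flags(TIMER_DEFERRABLE | TIMER_PINNED); /* D|P */
            return 0;
    }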
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1702101639290.4036@nanos Requested-by: Arjan van de Ven Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- include/linux/timer.h | 2 ++ include/trace/events/timer.h | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 51d601f192d4..a17f915f9456 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -66,6 +66,8 @@ struct timer_list { #define TIMER_ARRAYSHIFT 22 #define TIMER_ARRAYMASK 0xFFC00000 +#define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) + #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1448637616d6..f6d8fea6df77 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -36,6 +36,13 @@ DEFINE_EVENT(timer_class, timer_init, TP_ARGS(timer) ); +#define decode_timer_flags(flags) \ + __print_flags(flags, "|", \ + { TIMER_MIGRATING, "M" }, \ + { TIMER_DEFERRABLE, "D" }, \ + { TIMER_PINNED, "P" }, \ + { TIMER_IRQSAFE, "I" }) + /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list @@ -65,9 +72,12 @@ TRACE_EVENT(timer_start, __entry->flags = flags; ), - TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x", + TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, - (long)__entry->expires - __entry->now, __entry->flags) + (long)__entry->expires - __entry->now, + __entry->flags & TIMER_CPUMASK, + __entry->flags >> TIMER_ARRAYSHIFT, + decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** -- cgit v1.2.3-58-ga151 From 8e0f11429aec1119b17cbfcfa2d3194c8e864a26 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 13 Feb 2017 12:25:17 -0500 Subject: tracing/hwlat: Update old comment about migration The ftrace hwlat tracer does support a cpumask. Link: http://lkml.kernel.org/r/20170213122517.6e211955@redhat.com Signed-off-by: Luiz Capitulino Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 775569ec50d0..bf209d2657ab 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -322,10 +322,7 @@ static void move_to_next_cpu(void) * need to ensure nothing else might be running (and thus preempting). * Obviously this should never be used in production environments. * - * Currently this runs on which ever CPU it was scheduled on, but most - * real-world hardware latency situations occur across several CPUs, - * but we might later generalize this if we find there are any actualy - * systems with alternate SMI delivery or other hardware latencies. + * Executes one loop iteration on each CPU in the tracing_cpumask sysfs file. */ static int kthread_fn(void *data) { -- cgit v1.2.3-58-ga151 From 7257634135c247de235f3cdfdaa22f9eb5f054e4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 Feb 2017 20:21:28 +0900 Subject: tracing/probe: Show subsystem name in messages Show "trace_probe:", "trace_kprobe:" and "trace_uprobe:" headers for each warning/error/info message. This will help people to notice that kprobe/uprobe events caused those messages. 
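The pr_fmt() convention being relied on, modeled in userspace (pr_info() here is a printf stand-in; in the kernel, pr_fmt() must be defined before the printk headers are pulled in, which is why the hunks below add it above the #include lines):

    #include <stdio.h>

    /* Per-file prefix; every printing macro expands it first. */
    #define pr_fmt(fmt) "trace_uprobe: " fmt
    #define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
            pr_info("Probe point is not specified.\n");
            return 0;
    }

This prints "trace_uprobe: Probe point is not specified.", making the originating subsystem obvious in the log.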
From 8e0f11429aec1119b17cbfcfa2d3194c8e864a26 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino
Date: Mon, 13 Feb 2017 12:25:17 -0500
Subject: tracing/hwlat: Update old comment about migration

The ftrace hwlat tracer does support a cpumask; update the stale
comment that says otherwise.

Link: http://lkml.kernel.org/r/20170213122517.6e211955@redhat.com
Signed-off-by: Luiz Capitulino
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_hwlat.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 775569ec50d0..bf209d2657ab 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -322,10 +322,7 @@ static void move_to_next_cpu(void)
  * need to ensure nothing else might be running (and thus preempting).
  * Obviously this should never be used in production environments.
  *
- * Currently this runs on which ever CPU it was scheduled on, but most
- * real-world hardware latency situations occur across several CPUs,
- * but we might later generalize this if we find there are any actualy
- * systems with alternate SMI delivery or other hardware latencies.
+ * Executes one loop iteration on each CPU in the tracing_cpumask sysfs file.
  */
 static int kthread_fn(void *data)
 {
-- cgit v1.2.3-58-ga151

From 7257634135c247de235f3cdfdaa22f9eb5f054e4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 7 Feb 2017 20:21:28 +0900
Subject: tracing/probe: Show subsystem name in messages

Show "trace_probe:", "trace_kprobe:" and "trace_uprobe:" headers for
each warning/error/info message. This will help people notice that
kprobe/uprobe events caused those messages.

Link: http://lkml.kernel.org/r/148646647813.24658.16705315294927615333.stgit@devbox
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_kprobe.c | 1 +
 kernel/trace/trace_probe.c  | 1 +
 kernel/trace/trace_uprobe.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a133ecd741e4..5c9bd0550542 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -16,6 +16,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
+#define pr_fmt(fmt)	"trace_kprobe: " fmt

 #include <linux/module.h>
 #include <linux/uaccess.h>
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2a06f1fa7001..11b63328d6fb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -21,6 +21,7 @@
  * Copyright (C) IBM Corporation, 2010-2011
  * Author:	Srikar Dronamraju
  */
+#define pr_fmt(fmt)	"trace_probe: " fmt

 #include "trace_probe.h"
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4f2ba2bb11e0..f4379e772171 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
  * Copyright (C) IBM Corporation, 2010-2012
  * Author:	Srikar Dronamraju
  */
+#define pr_fmt(fmt)	"trace_uprobe: " fmt

 #include <linux/module.h>
 #include <linux/uaccess.h>
-- cgit v1.2.3-58-ga151
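The patch relies on the pr_fmt() convention from printk.h: pr_warn()
and friends expand their format string through pr_fmt(), so a per-file
definition placed before the includes prefixes every message emitted
from that file. A minimal sketch (the function and message here are
illustrative only):

#define pr_fmt(fmt) "trace_uprobe: " fmt

#include <linux/printk.h>

static void report_missing_probe_point(void)
{
	/* Emits: "trace_uprobe: Probe point is not specified." */
	pr_warn("Probe point is not specified.\n");
}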
From 3821fd35b58dba449bd894014fbf4e1c43c9e951 Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Fri, 3 Feb 2017 15:42:24 -0500
Subject: jump_label: Reduce the size of struct static_key

The static_key->next field goes mostly unused. The field is used for
associating module uses with a static key. Most uses of struct
static_key define a static key in the core kernel and make use of it
entirely within the core kernel, or define the static key in a module
and make use of it only from within that module. In fact, of the
~3,000 static keys defined, I found only about 5 or so that did not
fit this pattern.

Thus, we can remove the static_key->next field entirely and overload
the static_key->entries field. That is, when all the static_key uses
are contained within the same module, static_key->entries continues to
point to those uses. However, if the static_key uses are not contained
within the module where the static_key is defined, then we allocate a
struct static_key_mod, store a pointer to the uses within that struct
static_key_mod, and have the static key point at the static_key_mod.
This does incur some extra memory usage when a static_key is used in a
module that does not define it, but since there are only a handful of
such cases there is a net savings.

In order to identify if the static_key->entries pointer contains a
struct static_key_mod or a struct jump_entry pointer, bit 1 of
static_key->entries is set to 1 if it points to a struct
static_key_mod and is 0 if it points to a struct jump_entry. We were
already using bit 0 in a similar way to store the initial value of the
static_key. This does mean that allocations of struct static_key_mod
and the struct jump_entry tables need to be at least 4-byte aligned in
memory. As far as I can tell all arches meet this criteria.

For my .config, the patch increased the text by 778 bytes, but reduced
the data + bss size by 14,912, for a net savings of 14,134 bytes:

   text    data    bss      dec    hex filename
8092427 5016512 790528 13899467 d416cb vmlinux.pre
8093205 5001600 790528 13885333 d3df95 vmlinux.post

Link: http://lkml.kernel.org/r/1486154544-4321-1-git-send-email-jbaron@akamai.com
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Joe Perches
Signed-off-by: Jason Baron
Signed-off-by: Steven Rostedt (VMware)
---
 Documentation/static-keys.txt |   4 +-
 include/linux/jump_label.h    |  23 ++++---
 kernel/jump_label.c           | 153 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 145 insertions(+), 35 deletions(-)

diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index ea8d7b4e53f0..32a25fad0c1b 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -155,7 +155,9 @@ or:

 There are a few functions and macros that architectures must implement in order
 to take advantage of this optimization. If there is no architecture support, we
-simply fall back to a traditional, load, test, and jump sequence.
+simply fall back to a traditional, load, test, and jump sequence. Also, the
+struct jump_entry table must be at least 4-byte aligned because the
+static_key->entries field makes use of the two least significant bits.

 * select HAVE_ARCH_JUMP_LABEL, see: arch/x86/Kconfig
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a0547c571800..680c98b2f41c 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -89,11 +89,17 @@ extern bool static_key_initialized;

 struct static_key {
 	atomic_t enabled;
-/* Set lsb bit to 1 if branch is default true, 0 ot */
-	struct jump_entry *entries;
-#ifdef CONFIG_MODULES
-	struct static_key_mod *next;
-#endif
+/*
+ * bit 0 => 1 if key is initially true
+ *	    0 if initially false
+ * bit 1 => 1 if points to struct static_key_mod
+ *	    0 if points to struct jump_entry
+ */
+	union {
+		unsigned long type;
+		struct jump_entry *entries;
+		struct static_key_mod *next;
+	};
 };

 #else
@@ -118,9 +124,10 @@ struct module;

 #ifdef HAVE_JUMP_LABEL

-#define JUMP_TYPE_FALSE	0UL
-#define JUMP_TYPE_TRUE	1UL
-#define JUMP_TYPE_MASK	1UL
+#define JUMP_TYPE_FALSE		0UL
+#define JUMP_TYPE_TRUE		1UL
+#define JUMP_TYPE_LINKED	2UL
+#define JUMP_TYPE_MASK		3UL

 static __always_inline bool static_key_false(struct static_key *key)
 {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..953411f5ba7f 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -229,12 +229,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry

 static inline struct jump_entry *static_key_entries(struct static_key *key)
 {
-	return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+	WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+	return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
 }

 static inline bool static_key_type(struct static_key *key)
 {
-	return (unsigned long)key->entries & JUMP_TYPE_MASK;
+	return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+	return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+	key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+	key->type |= JUMP_TYPE_LINKED;
 }

 static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -247,6 +263,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
 	return (unsigned long)entry->key & 1UL;
 }

+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+				   struct jump_entry *entries)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->entries = entries;
+	key->type |= type;
+}
+
 static enum jump_label_type jump_label_type(struct jump_entry *entry)
 {
 	struct static_key *key = jump_entry_key(entry);
@@ -306,13 +342,7 @@ void __init jump_label_init(void)
 			continue;

 		key = iterk;
-		/*
-		 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-		 */
-		*((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
-		key->next = NULL;
-#endif
+		static_key_set_entries(key, iter);
 	}
 	static_key_initialized = true;
 	jump_label_unlock();
@@ -336,6 +366,29 @@ struct static_key_mod {
 	struct module *mod;
 };

+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+	WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+	return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+			       struct static_key_mod *mod)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->next = mod;
+	key->type |= type;
+}
+
 static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
@@ -358,11 +411,23 @@ static void __jump_label_mod_update(struct static_key *key)
 {
 	struct static_key_mod *mod;

-	for (mod = key->next; mod; mod = mod->next) {
-		struct module *m = mod->mod;
+	for (mod = static_key_mod(key); mod; mod = mod->next) {
+		struct jump_entry *stop;
+		struct module *m;
+
+		/*
+		 * NULL if the static_key is defined in a module
+		 * that does not use it
+		 */
+		if (!mod->entries)
+			continue;

-		__jump_label_update(key, mod->entries,
-				    m->jump_entries + m->num_jump_entries);
+		m = mod->mod;
+		if (!m)
+			stop = __stop___jump_table;
+		else
+			stop = m->jump_entries + m->num_jump_entries;
+		__jump_label_update(key, mod->entries, stop);
 	}
 }

@@ -397,7 +462,7 @@ static int jump_label_add_module(struct module *mod)
 	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
 	struct jump_entry *iter;
 	struct static_key *key = NULL;
-	struct static_key_mod *jlm;
+	struct static_key_mod *jlm, *jlm2;

 	/* if the module doesn't have jump label entries, just return */
 	if (iter_start == iter_stop)
@@ -414,20 +479,32 @@ static int jump_label_add_module(struct module *mod)

 		key = iterk;
 		if (within_module(iter->key, mod)) {
-			/*
-			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-			 */
-			*((unsigned long *)&key->entries) += (unsigned long)iter;
-			key->next = NULL;
+			static_key_set_entries(key, iter);
 			continue;
 		}
 		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
 		if (!jlm)
 			return -ENOMEM;
+		if (!static_key_linked(key)) {
+			jlm2 = kzalloc(sizeof(struct static_key_mod),
+				       GFP_KERNEL);
+			if (!jlm2) {
+				kfree(jlm);
+				return -ENOMEM;
+			}
+			preempt_disable();
+			jlm2->mod = __module_address((unsigned long)key);
+			preempt_enable();
+			jlm2->entries = static_key_entries(key);
+			jlm2->next = NULL;
+			static_key_set_mod(key, jlm2);
+			static_key_set_linked(key);
+		}
 		jlm->mod = mod;
 		jlm->entries = iter;
-		jlm->next = key->next;
-		key->next = jlm;
+		jlm->next = static_key_mod(key);
+		static_key_set_mod(key, jlm);
+		static_key_set_linked(key);

 		/* Only update if we've changed from our initial state */
 		if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -454,16 +531,34 @@ static void jump_label_del_module(struct module *mod)
 		if (within_module(iter->key, mod))
 			continue;

+		/* No memory during module load */
+		if (WARN_ON(!static_key_linked(key)))
+			continue;
+
 		prev = &key->next;
-		jlm = key->next;
+		jlm = static_key_mod(key);

 		while (jlm && jlm->mod != mod) {
 			prev = &jlm->next;
 			jlm = jlm->next;
 		}

-		if (jlm) {
+		/* No memory during module load */
+		if (WARN_ON(!jlm))
+			continue;
+
+		if (prev == &key->next)
+			static_key_set_mod(key, jlm->next);
+		else
 			*prev = jlm->next;
+
+		kfree(jlm);
+
+		jlm = static_key_mod(key);
+		/* if only one entry is left, fold it back into the static_key */
+		if (jlm->next == NULL) {
+			static_key_set_entries(key, jlm->entries);
+			static_key_clear_linked(key);
 			kfree(jlm);
 		}
 	}
@@ -492,8 +587,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
 	case MODULE_STATE_COMING:
 		jump_label_lock();
 		ret = jump_label_add_module(mod);
-		if (ret)
+		if (ret) {
+			WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
 			jump_label_del_module(mod);
+		}
 		jump_label_unlock();
 		break;
 	case MODULE_STATE_GOING:
@@ -554,11 +651,14 @@ int jump_label_text_reserved(void *start, void *end)
 static void jump_label_update(struct static_key *key)
 {
 	struct jump_entry *stop = __stop___jump_table;
-	struct jump_entry *entry = static_key_entries(key);
+	struct jump_entry *entry;
 #ifdef CONFIG_MODULES
 	struct module *mod;

-	__jump_label_mod_update(key);
+	if (static_key_linked(key)) {
+		__jump_label_mod_update(key);
+		return;
+	}

 	preempt_disable();
 	mod = __module_address((unsigned long)key);
@@ -566,6 +666,7 @@ static void jump_label_update(struct static_key *key)
 		stop = mod->jump_entries + mod->num_jump_entries;
 	preempt_enable();
 #endif
+	entry = static_key_entries(key);
 	/* if there are no users, entry can be NULL */
 	if (entry)
 		__jump_label_update(key, entry, stop);
-- cgit v1.2.3-58-ga151
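The low-bit tagging that the patch applies to static_key can be
demonstrated in isolation. A stand-alone user-space sketch (the names
are illustrative, not the kernel's): since the pointed-to objects are
at least 4-byte aligned, the two least significant pointer bits are
free to carry state, and the accessors must preserve them across
stores:

#include <assert.h>
#include <stdio.h>

#define TYPE_TRUE   1UL /* bit 0: initial branch direction */
#define TYPE_LINKED 2UL /* bit 1: points to a module list, not a table */
#define TYPE_MASK   3UL

struct entry { int dummy; };

union tagged_key {
	unsigned long type;
	struct entry *entries;
};

static void key_set_entries(union tagged_key *key, struct entry *e)
{
	unsigned long bits = key->type & TYPE_MASK;

	/* The table must be at least 4-byte aligned. */
	assert(((unsigned long)e & TYPE_MASK) == 0);
	key->entries = e;  /* overwrites the tag bits... */
	key->type |= bits; /* ...so put them back */
}

static struct entry *key_get_entries(union tagged_key *key)
{
	return (struct entry *)(key->type & ~TYPE_MASK);
}

int main(void)
{
	static struct entry table[4];
	union tagged_key key = { .type = TYPE_TRUE };

	key_set_entries(&key, table);
	printf("entries=%p initially_true=%lu linked=%lu\n",
	       (void *)key_get_entries(&key),
	       key.type & TYPE_TRUE, key.type & TYPE_LINKED);
	return 0;
}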
From eb583cd484f9a76c3716c0539bd06340943eeff9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Mon, 23 Jan 2017 13:24:45 +0100
Subject: tracing: Use modern function declaration

We get a lot of harmless warnings about this header file at the W=1
level because of an unusual function declaration:

kernel/trace/trace.h:766:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration]

This moves the 'inline' keyword to where it normally belongs, avoiding
the warning.

Link: http://lkml.kernel.org/r/20170123122521.3389010-1-arnd@arndb.de
Fixes: 4046bf023b06 ("ftrace: Expose ftrace_hash_empty and ftrace_lookup_ip")
Signed-off-by: Arnd Bergmann
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d2d068b36341..ae1cce91fead 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -763,7 +763,7 @@ struct ftrace_hash {
 struct ftrace_func_entry *
 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);

-static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
 {
 	return !hash || !hash->count;
 }
-- cgit v1.2.3-58-ga151

From 8f0994bb8cbde5452e58ce0cacdbf6cb58079d01 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Thu, 12 Jan 2017 13:55:02 +0000
Subject: tracing: Fix return value check in trace_benchmark_reg()

In case of error, the function kthread_run() returns ERR_PTR() and
never returns NULL. The NULL test in the return value check should be
replaced with IS_ERR().

Link: http://lkml.kernel.org/r/20170112135502.28556-1-weiyj.lk@gmail.com
Cc: stable@vger.kernel.org
Fixes: 81dc9f0e ("tracing: Add tracepoint benchmark tracepoint")
Signed-off-by: Wei Yongjun
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_benchmark.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e3b488825ae3..e49fbe901cfc 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -175,9 +175,9 @@ int trace_benchmark_reg(void)

 	bm_event_thread = kthread_run(benchmark_event_kthread,
 				      NULL, "event_benchmark");
-	if (!bm_event_thread) {
+	if (IS_ERR(bm_event_thread)) {
 		pr_warning("trace benchmark failed to create kernel thread\n");
-		return -ENOMEM;
+		return PTR_ERR(bm_event_thread);
 	}

 	return 0;
-- cgit v1.2.3-58-ga151
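The underlying pattern generalizes: kthread_run(), like many kernel
APIs that can fail for reasons other than lack of memory, encodes the
errno in the returned pointer via ERR_PTR() and never returns NULL. A
minimal sketch of the correct calling convention (the thread function
and names are illustrative):

#include <linux/err.h>
#include <linux/kthread.h>

static struct task_struct *worker;

static int example_thread_fn(void *data)
{
	return 0;
}

static int start_worker(void)
{
	worker = kthread_run(example_thread_fn, NULL, "example_worker");
	if (IS_ERR(worker))             /* not: if (!worker) */
		return PTR_ERR(worker); /* propagate the real errno */
	return 0;
}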
From bef5da60595f5706232e2502876f8545743b9b41 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Fri, 10 Feb 2017 22:21:55 +0900
Subject: tracing/probes: Fix a warning message to show correct maximum length

Since tracing/*probe_events will accept a probe definition of up to
4096 - 2 bytes ('\n' and '\0'), the warning message must show 4094
instead of 4096.

Note that there is one corner case that can exceed 4094: if the user
prepares a 4096-byte null-terminated string and writes it with
count == 4095, it can be accepted. However, if the user then writes a
'\n' after that, it must be rejected. So IMHO, the warning message
should indicate the shorter limit, since it is safer.

Link: http://lkml.kernel.org/r/148673290462.2579.7966778294009665632.stgit@devbox
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_probe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 11b63328d6fb..52478f033f88 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -679,8 +679,9 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
 				if (done + size < count) {
 					if (buf != kbuf)
 						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
 					pr_warn("Line length is too long: Should be less than %d\n",
-						WRITE_BUFSIZE);
+						WRITE_BUFSIZE - 2);
 					ret = -EINVAL;
 					goto out;
 				}
-- cgit v1.2.3-58-ga151

From 67d04bb2bcbd3e99f4c4daa58599c90a83ad314a Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 16 Feb 2017 20:10:58 -0800
Subject: tracing: Remove outdated ring buffer comment

The comment about the ring buffer's organization is outdated and the
code sits elsewhere; remove the comment.

Link: http://lkml.kernel.org/r/20170217041058.23904-1-joelaf@google.com
Cc: Ingo Molnar
Signed-off-by: Joel Fernandes
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4589b67168fc..54e3b8711aca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec)
 	TRACE_ITER_EVENT_FORK

 /*
- * The global_trace is the descriptor that holds the tracing
- * buffers for the live tracing. For each CPU, it contains
- * a link list of pages that will store trace entries. The
- * page descriptor of the pages in the memory is used to hold
- * the link list by linking the lru item in the page descriptor
- * to each of the pages in the buffer per CPU.
- *
- * For each active CPU there is a data field that holds the
- * pages for the buffer for that CPU. Each CPU has the same number
- * of pages allocated for its buffer.
+ * The global_trace is the descriptor that holds the top-level tracing
+ * buffers for the live tracing.
  */
 static struct trace_array global_trace = {
 	.trace_flags = TRACE_DEFAULT_FLAGS,
-- cgit v1.2.3-58-ga151