summaryrefslogtreecommitdiff
path: root/tools/perf/util/arm-spe.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-01-18 06:32:11 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2022-01-18 06:32:11 +0200
commit57d17378a4a042401b0c2fe211e5a0e3a276cb3d (patch)
treebe65e51a7b11cccb903b44708b98b3af47477304 /tools/perf/util/arm-spe.c
parentf0033681f0fe8421baf8db125e57fa6157824c2d (diff)
parent9bce13ea88f85344b765abe5d3dabdd0f44dc177 (diff)
Merge tag 'perf-tools-for-v5.17-2022-01-16' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
Pull perf tool updates from Arnaldo Carvalho de Melo: "New features: - Add 'trace' subcommand for 'perf ftrace', setting the stage for more 'perf ftrace' subcommands. Not using a subcommand yields the previous behaviour of 'perf ftrace'. - Add 'latency' subcommand to 'perf ftrace', that can use the function graph tracer or a BPF optimized one, via the -b/--use-bpf option. E.g.: $ sudo perf ftrace latency -a -T mutex_lock sleep 1 # DURATION | COUNT | GRAPH | 0 - 1 us | 4596 | ######################## | 1 - 2 us | 1680 | ######### | 2 - 4 us | 1106 | ##### | 4 - 8 us | 546 | ## | 8 - 16 us | 562 | ### | 16 - 32 us | 1 | | 32 - 64 us | 0 | | 64 - 128 us | 0 | | 128 - 256 us | 0 | | 256 - 512 us | 0 | | 512 - 1024 us | 0 | | 1 - 2 ms | 0 | | 2 - 4 ms | 0 | | 4 - 8 ms | 0 | | 8 - 16 ms | 0 | | 16 - 32 ms | 0 | | 32 - 64 ms | 0 | | 64 - 128 ms | 0 | | 128 - 256 ms | 0 | | 256 - 512 ms | 0 | | 512 - 1024 ms | 0 | | 1 - ... s | 0 | | The original implementation of this command was in the bcc tool. - Support --cputype option for hybrid events in 'perf stat'. Improvements: - Call chain improvements for ARM64. - No need to do any affinity setup when profiling pids. - Reduce multiplexing with duration_time in 'perf stat' metrics. - Improve error message for uncore events, stating that some event groups are can only be used in system wide (-a) mode. - perf stat metric group leader fixes/improvements, including arch specific changes to better support Intel topdown events. - Probe non-deprecated sysfs path first, i.e. try the path /sys/devices/system/cpu/cpuN/topology/thread_siblings first, then the old /sys/devices/system/cpu/cpuN/topology/core_cpus. - Disable debuginfod by default in 'perf record', to avoid stalls on distros such as Fedora 35. - Use unbuffered output in 'perf bench' when pipe/tee'ing to a file. - Enable ignore_missing_thread in 'perf trace' Fixes: - Avoid TUI crash when navigating in the annotation of recursive functions. - Fix hex dump character output in 'perf script'. - Fix JSON indentation to 4 spaces standard in the ARM vendor event files. - Fix use after free in metric__new(). - Fix IS_ERR_OR_NULL() usage in the perf BPF loader. - Fix up cross-arch register support, i.e. when printing register names take into account the architecture where the perf.data file was collected. - Fix SMT fallback with large core counts. - Don't lower case MetricExpr when parsing JSON files so as not to lose info such as the ":G" event modifier in metrics. perf test: - Add basic stress test for sigtrap handling to 'perf test'. - Fix 'perf test' failures on s/390 - Enable system wide for metricgroups test in 'perf test´. - Use 3 digits for test numbering now we can have more tests. Arch specific: - Add events for Arm Neoverse N2 in the ARM JSON vendor event files - Support PERF_MEM_LVLNUM encodings in powerpc, that came from a single patch series, where I incorrectly merged the kernel bits, that were then reverted after coordination with Michael Ellerman and Stephen Rothwell. - Add ARM SPE total latency as PERF_SAMPLE_WEIGHT. - Update AMD documentation, with info on raw event encoding. - Add support for global and local variants of the "p_stage_cyc" sort key, applicable to perf.data files collected on powerpc. - Remove duplicate and incorrect aux size checks in the ARM CoreSight ETM code. Refactorings: - Add a perf_cpu abstraction to disambiguate CPUs and CPU map indexes, fixing problems along the way. - Document CPU map methods. UAPI sync: - Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy' - Sync UAPI files with the kernel sources: drm, msr-index, cpufeatures. Build system - Enable warnings through HOSTCFLAGS. - Drop requirement for libstdc++.so for libopencsd check libperf: - Make libperf adopt perf_counts_values__scale() from tools/perf/util/. - Add a stat multiplexing test to libperf" * tag 'perf-tools-for-v5.17-2022-01-16' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (115 commits) perf record: Disable debuginfod by default perf evlist: No need to do any affinity setup when profiling pids perf cpumap: Add is_dummy() method perf metric: Fix metric_leader perf cputopo: Fix CPU topology reading on s/390 perf metricgroup: Fix use after free in metric__new() libperf tests: Update a use of the new cpumap API perf arm: Fix off-by-one directory path tools arch x86: Sync the msr-index.h copy with the kernel sources tools headers cpufeatures: Sync with the kernel sources tools headers UAPI: Update tools's copy of drm.h header tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy' perf pmu-events: Don't lower case MetricExpr perf expr: Add debug logging for literals perf tools: Probe non-deprecated sysfs path 1st perf tools: Fix SMT fallback with large core counts perf cpumap: Give CPUs their own type perf stat: Correct first_shadow_cpu to return index perf script: Fix flipped index and cpu perf c2c: Use more intention revealing iterator ...
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--tools/perf/util/arm-spe.c67
1 files changed, 66 insertions, 1 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index fccac06b573a..d2b64e3f588b 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -58,6 +58,8 @@ struct arm_spe {
u8 sample_branch;
u8 sample_remote_access;
u8 sample_memory;
+ u8 sample_instructions;
+ u64 instructions_sample_period;
u64 l1d_miss_id;
u64 l1d_access_id;
@@ -68,6 +70,7 @@ struct arm_spe {
u64 branch_miss_id;
u64 remote_access_id;
u64 memory_id;
+ u64 instructions_id;
u64 kernel_start;
@@ -90,6 +93,7 @@ struct arm_spe_queue {
u64 time;
u64 timestamp;
struct thread *thread;
+ u64 period_instructions;
};
static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
@@ -202,6 +206,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
speq->pid = -1;
speq->tid = -1;
speq->cpu = -1;
+ speq->period_instructions = 0;
/* params set */
params.get_trace = arm_spe_get_trace;
@@ -330,6 +335,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
sample.addr = record->virt_addr;
sample.phys_addr = record->phys_addr;
sample.data_src = data_src;
+ sample.weight = record->latency;
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
@@ -347,6 +353,36 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
sample.id = spe_events_id;
sample.stream_id = spe_events_id;
sample.addr = record->to_ip;
+ sample.weight = record->latency;
+
+ return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+}
+
+static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
+ u64 spe_events_id, u64 data_src)
+{
+ struct arm_spe *spe = speq->spe;
+ struct arm_spe_record *record = &speq->decoder->record;
+ union perf_event *event = speq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+
+ /*
+ * Handles perf instruction sampling period.
+ */
+ speq->period_instructions++;
+ if (speq->period_instructions < spe->instructions_sample_period)
+ return 0;
+ speq->period_instructions = 0;
+
+ arm_spe_prep_sample(spe, speq, event, &sample);
+
+ sample.id = spe_events_id;
+ sample.stream_id = spe_events_id;
+ sample.addr = record->virt_addr;
+ sample.phys_addr = record->phys_addr;
+ sample.data_src = data_src;
+ sample.period = spe->instructions_sample_period;
+ sample.weight = record->latency;
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
@@ -480,6 +516,12 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
return err;
}
+ if (spe->sample_instructions) {
+ err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -993,7 +1035,8 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
attr.type = PERF_TYPE_HARDWARE;
attr.sample_type = evsel->core.attr.sample_type & PERF_SAMPLE_MASK;
attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
- PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
+ PERF_SAMPLE_WEIGHT;
if (spe->timeless_decoding)
attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
else
@@ -1107,7 +1150,29 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
return err;
spe->memory_id = id;
arm_spe_set_event_name(evlist, id, "memory");
+ id += 1;
+ }
+
+ if (spe->synth_opts.instructions) {
+ if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
+ pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
+ goto synth_instructions_out;
+ }
+ if (spe->synth_opts.period > 1)
+ pr_warning("Arm SPE has a hardware-based sample period.\n"
+ "Additional instruction events will be discarded by --itrace\n");
+
+ spe->sample_instructions = true;
+ attr.config = PERF_COUNT_HW_INSTRUCTIONS;
+ attr.sample_period = spe->synth_opts.period;
+ spe->instructions_sample_period = attr.sample_period;
+ err = arm_spe_synth_event(session, &attr, id);
+ if (err)
+ return err;
+ spe->instructions_id = id;
+ arm_spe_set_event_name(evlist, id, "instructions");
}
+synth_instructions_out:
return 0;
}