1 files changed, 1044 insertions, 2 deletions
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 54d4e508a092..2a0f992ca0be 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -26,6 +26,7 @@
 #include "util/thread.h"
 #include "util/namespaces.h"
 #include "util/util.h"
+#include "util/tsc.h"
 
 #include <internal/lib.h>
 
@@ -35,8 +36,70 @@
 
 #include <linux/list.h>
 #include <linux/string.h>
+#include <linux/zalloc.h>
+#include <linux/hash.h>
 #include <errno.h>
 #include <signal.h>
+#include <inttypes.h>
+
+struct guest_event {
+	struct perf_sample		sample;
+	union perf_event		*event;
+	char				event_buf[PERF_SAMPLE_MAX_SIZE];
+};
+
+struct guest_id {
+	/* hlist_node must be first, see free_hlist() */
+	struct hlist_node		node;
+	u64				id;
+	u64				host_id;
+	u32				vcpu;
+};
+
+struct guest_tid {
+	/* hlist_node must be first, see free_hlist() */
+	struct hlist_node		node;
+	/* Thread ID of QEMU thread */
+	u32				tid;
+	u32				vcpu;
+};
+
+struct guest_vcpu {
+	/* Current host CPU */
+	u32				cpu;
+	/* Thread ID of QEMU thread */
+	u32				tid;
+};
+
+struct guest_session {
+	char				*perf_data_file;
+	u32				machine_pid;
+	u64				time_offset;
+	double				time_scale;
+	struct perf_tool		tool;
+	struct perf_data		data;
+	struct perf_session		*session;
+	char				*tmp_file_name;
+	int				tmp_fd;
+	struct perf_tsc_conversion	host_tc;
+	struct perf_tsc_conversion	guest_tc;
+	bool				copy_kcore_dir;
+	bool				have_tc;
+	bool				fetched;
+	bool				ready;
+	u16				dflt_id_hdr_size;
+	u64				dflt_id;
+	u64				highest_id;
+	/* Array of guest_vcpu */
+	struct guest_vcpu		*vcpu;
+	size_t				vcpu_cnt;
+	/* Hash table for guest_id */
+	struct hlist_head		heads[PERF_EVLIST__HLIST_SIZE];
+	/* Hash table for guest_tid */
+	struct hlist_head		tids[PERF_EVLIST__HLIST_SIZE];
+	/* Place to stash next guest event */
+	struct guest_event		ev;
+};
 
 struct perf_inject {
 	struct perf_tool	tool;
@@ -59,6 +122,7 @@ struct perf_inject {
 	struct itrace_synth_opts itrace_synth_opts;
 	char			event_copy[PERF_SAMPLE_MAX_SIZE];
 	struct perf_file_section secs[HEADER_FEAT_BITS];
+	struct guest_session	guest_session;
 };
 
 struct event_entry {
@@ -698,6 +762,841 @@ found:
 	return perf_event__repipe(tool, event_sw, &sample_sw, machine);
 }
 
+static struct guest_vcpu *guest_session__vcpu(struct guest_session *gs, u32 vcpu)
+{
+	if (realloc_array_as_needed(gs->vcpu, gs->vcpu_cnt, vcpu, NULL))
+		return NULL;
+	return &gs->vcpu[vcpu];
+}
+
+static int guest_session__output_bytes(struct guest_session *gs, void *buf, size_t sz)
+{
+	ssize_t ret = writen(gs->tmp_fd, buf, sz);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int guest_session__repipe(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_sample *sample __maybe_unused,
+				 struct machine *machine __maybe_unused)
+{
+	struct guest_session *gs = container_of(tool, struct guest_session, tool);
+
+	return guest_session__output_bytes(gs, event, event->header.size);
+}
+
+static int guest_session__map_tid(struct guest_session *gs, u32 tid, u32 vcpu)
+{
+	struct guest_tid *guest_tid = zalloc(sizeof(*guest_tid));
+	int hash;
+
+	if (!guest_tid)
+		return -ENOMEM;
+
+	guest_tid->tid = tid;
+	guest_tid->vcpu = vcpu;
+	hash = hash_32(guest_tid->tid, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&guest_tid->node, &gs->tids[hash]);
+
+	return 0;
+}
+
+/* Peek callback: note which QEMU thread (tid) runs which VCPU, based on COMM events */
+static int host_peek_vm_comms_cb(struct perf_session *session __maybe_unused,
+				 union perf_event *event,
+				 u64 offset __maybe_unused, void *data)
+{
+	struct guest_session *gs = data;
+	unsigned int vcpu;
+	struct guest_vcpu *guest_vcpu;
+
+	if (event->header.type != PERF_RECORD_COMM ||
+	    event->comm.pid != gs->machine_pid)
+		return 0;
+
+	/*
+	 * QEMU option -name debug-threads=on, causes thread names formatted as
+	 * below, although it is not an ABI. Also libvirt seems to use this by
+	 * default. Here we rely on it to tell us which thread is which VCPU.
+	 * Any other comm is skipped: sscanf() gives 0 or EOF (negative) for those.
+	 */
+	if (sscanf(event->comm.comm, "CPU %u/KVM", &vcpu) != 1)
+		return 0;
+	pr_debug("Found VCPU: tid %u comm %s vcpu %u\n",
+		 event->comm.tid, event->comm.comm, vcpu);
+	if (vcpu > INT_MAX) {
+		pr_err("Invalid VCPU %u\n", vcpu);
+		return -EINVAL;
+	}
+	guest_vcpu = guest_session__vcpu(gs, vcpu);
+	if (!guest_vcpu)
+		return -ENOMEM;
+	if (guest_vcpu->tid && guest_vcpu->tid != event->comm.tid) {
+		pr_err("Fatal error: Two threads found with the same VCPU\n");
+		return -EINVAL;
+	}
+	guest_vcpu->tid = event->comm.tid;
+
+	return guest_session__map_tid(gs, event->comm.tid, vcpu);
+}
+
+static int host_peek_vm_comms(struct perf_session *session, struct guest_session *gs)
+{
+	return perf_session__peek_events(session, session->header.data_offset,
+					 session->header.data_size,
+					 host_peek_vm_comms_cb, gs);
+}
+
+static bool evlist__is_id_used(struct evlist *evlist, u64 id)
+{
+	return evlist__id2sid(evlist, id);
+}
+
+static u64 guest_session__allocate_new_id(struct guest_session *gs, struct evlist *host_evlist)
+{
+	do {
+		gs->highest_id += 1;
+	} while (!gs->highest_id || evlist__is_id_used(host_evlist, gs->highest_id));
+
+	return gs->highest_id;
+}
+
+static int guest_session__map_id(struct guest_session *gs, u64 id, u64 host_id, u32 vcpu)
+{
+	struct guest_id *guest_id = zalloc(sizeof(*guest_id));
+	int hash;
+
+	if (!guest_id)
+		return -ENOMEM;
+
+	guest_id->id = id;
+	guest_id->host_id = host_id;
+	guest_id->vcpu = vcpu;
+	hash = hash_64(guest_id->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&guest_id->node, &gs->heads[hash]);
+
+	return 0;
+}
+
+static u64 evlist__find_highest_id(struct evlist *evlist)
+{
+	/* 1 is the floor: it is returned when no evsel has a larger ID */
+	u64 max_id = 1;
+	struct evsel *pos;
+
+	evlist__for_each_entry(evlist, pos) {
+		u32 k = pos->core.ids;
+
+		/* Walk this evsel's ID array backwards; the maximum does not depend on order */
+		while (k--) {
+			if (pos->core.id[k] > max_id)
+				max_id = pos->core.id[k];
+		}
+	}
+
+	return max_id;
+}
+
+static int guest_session__map_ids(struct guest_session *gs, struct evlist *host_evlist)
+{
+	struct evlist *evlist = gs->session->evlist;
+	struct evsel *evsel;
+	int ret;
+
+	evlist__for_each_entry(evlist, evsel) {
+		u32 j;
+
+		for (j = 0; j < evsel->core.ids; j++) {
+			struct perf_sample_id *sid;
+			u64 host_id;
+			u64 id;
+
+			id = evsel->core.id[j];
+			sid = evlist__id2sid(evlist, id);
+			if (!sid || sid->cpu.cpu == -1)
+				continue;
+			host_id = guest_session__allocate_new_id(gs, host_evlist);
+			ret = guest_session__map_id(gs, id, host_id, sid->cpu.cpu);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/* Find the guest_id entry for a guest sample ID, or NULL if there is none */
+static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 id)
+{
+	/* Entries are added to the bucket selected by hash_64() of their guest ID */
+	struct hlist_head *bucket = &gs->heads[hash_64(id, PERF_EVLIST__HLIST_BITS)];
+	struct guest_id *entry;
+
+	hlist_for_each_entry(entry, bucket, node) {
+		if (entry->id != id)
+			continue;
+		return entry;
+	}
+
+	return NULL;
+}
+
+static int process_attr(struct perf_tool *tool, union perf_event *event,
+			struct perf_sample *sample __maybe_unused,
+			struct machine *machine __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+
+	return perf_event__process_attr(tool, event, &inject->session->evlist);
+}
+
+/* Add a guest evsel's attr to the host session, under the mapped host sample IDs */
+static int guest_session__add_attr(struct guest_session *gs, struct evsel *evsel)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	struct perf_event_attr attr = evsel->core.attr;
+	u64 *id_array = calloc(evsel->core.ids, sizeof(*id_array));
+	u32 *vcpu_array = calloc(evsel->core.ids, sizeof(*vcpu_array));
+	int ret = -ENOMEM;
+	u32 i;
+
+	if (!id_array || !vcpu_array)
+		goto out;
+
+	for (i = 0; i < evsel->core.ids; i++) {
+		u64 id = evsel->core.id[i];
+		struct guest_id *guest_id = guest_session__lookup_id(gs, id);
+
+		if (!guest_id) {
+			pr_err("Failed to find guest id %"PRIu64"\n", id);
+			ret = -EINVAL;
+			goto out;
+		}
+		id_array[i] = guest_id->host_id;
+		vcpu_array[i] = guest_id->vcpu;
+	}
+
+	attr.sample_type |= PERF_SAMPLE_IDENTIFIER;
+	attr.exclude_host = 1;
+	attr.exclude_guest = 0;
+	ret = perf_event__synthesize_attr(&inject->tool, &attr, evsel->core.ids,
+					  id_array, process_attr);
+	if (ret) {
+		pr_err("Failed to add guest attr.\n");
+		goto out;
+	}
+	for (i = 0; i < evsel->core.ids; i++) {
+		struct perf_sample_id *sid = evlist__id2sid(inject->session->evlist, id_array[i]);
+		u32 vcpu = vcpu_array[i];
+
+		if (!sid || vcpu >= gs->vcpu_cnt) {
+			pr_err("Bad host id %"PRIu64" or VCPU %u\n", id_array[i], vcpu);
+			ret = -EINVAL;
+			goto out;
+		}
+		/* Guest event is per-thread from the host point of view */
+		sid->cpu.cpu = -1;
+		sid->tid = gs->vcpu[vcpu].tid;
+		sid->machine_pid = gs->machine_pid;
+		sid->vcpu.cpu = vcpu;
+	}
+out:
+	free(vcpu_array);
+	free(id_array);
+	return ret;
+}
+
+static int guest_session__add_attrs(struct guest_session *gs)
+{
+	struct evlist *evlist = gs->session->evlist;
+	struct evsel *evsel;
+	int ret;
+
+	evlist__for_each_entry(evlist, evsel) {
+		ret = guest_session__add_attr(gs, evsel);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int synthesize_id_index(struct perf_inject *inject, size_t new_cnt)
+{
+	struct perf_session *session = inject->session;
+	struct evlist *evlist = session->evlist;
+	struct machine *machine = &session->machines.host;
+	size_t from = evlist->core.nr_entries - new_cnt;
+
+	return __perf_event__synthesize_id_index(&inject->tool, perf_event__repipe,
+						 evlist, machine, from);
+}
+
+/* Find the guest_tid entry for a host thread ID, or NULL if there is none */
+static struct guest_tid *guest_session__lookup_tid(struct guest_session *gs, u32 tid)
+{
+	/* Entries are added to the bucket selected by hash_32() of their tid */
+	struct hlist_head *bucket = &gs->tids[hash_32(tid, PERF_EVLIST__HLIST_BITS)];
+	struct guest_tid *entry;
+
+	hlist_for_each_entry(entry, bucket, node) {
+		if (entry->tid != tid)
+			continue;
+		return entry;
+	}
+
+	return NULL;
+}
+
+static bool dso__is_in_kernel_space(struct dso *dso)
+{
+	if (dso__is_vdso(dso))
+		return false;
+
+	return dso__is_kcore(dso) ||
+	       dso->kernel ||
+	       is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN);
+}
+
+static u64 evlist__first_id(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel->core.ids)
+			return evsel->core.id[0];
+	}
+	return 0;
+}
+
+static int process_build_id(struct perf_tool *tool,
+			    union perf_event *event,
+			    struct perf_sample *sample __maybe_unused,
+			    struct machine *machine __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+
+	return perf_event__process_build_id(inject->session, event);
+}
+
+static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_t machine_pid)
+{
+	struct machine *machine = perf_session__findnew_machine(inject->session, machine_pid);
+	u8 cpumode = dso__is_in_kernel_space(dso) ?
+			PERF_RECORD_MISC_GUEST_KERNEL :
+			PERF_RECORD_MISC_GUEST_USER;
+
+	if (!machine)
+		return -ENOMEM;
+
+	dso->hit = 1;
+
+	return perf_event__synthesize_build_id(&inject->tool, dso, cpumode,
+					       process_build_id, machine);
+}
+
+static int guest_session__add_build_ids(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	struct machine *machine = &gs->session->machines.host;
+	struct dso *dso;
+	int ret;
+
+	/* Build IDs will be put in the Build ID feature section */
+	perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID);
+
+	dsos__for_each_with_build_id(dso, &machine->dsos.head) {
+		ret = synthesize_build_id(inject, dso, gs->machine_pid);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int guest_session__ksymbol_event(struct perf_tool *tool,
+					union perf_event *event,
+					struct perf_sample *sample __maybe_unused,
+					struct machine *machine __maybe_unused)
+{
+	struct guest_session *gs = container_of(tool, struct guest_session, tool);
+
+	/* Only support out-of-line i.e. no BPF support */
+	if (event->ksymbol.ksym_type != PERF_RECORD_KSYMBOL_TYPE_OOL)
+		return 0;
+
+	return guest_session__output_bytes(gs, event, event->header.size);
+}
+
+/* Process the guest perf.data file, writing events to be injected to a temporary file */
+static int guest_session__start(struct guest_session *gs, const char *name, bool force)
+{
+	char tmp_file_name[] = "/tmp/perf-inject-guest_session-XXXXXX";
+	struct perf_session *session;
+	int ret;
+
+	/* Only these events will be injected */
+	gs->tool.mmap		= guest_session__repipe;
+	gs->tool.mmap2		= guest_session__repipe;
+	gs->tool.comm		= guest_session__repipe;
+	gs->tool.fork		= guest_session__repipe;
+	gs->tool.exit		= guest_session__repipe;
+	gs->tool.lost		= guest_session__repipe;
+	gs->tool.context_switch	= guest_session__repipe;
+	gs->tool.ksymbol	= guest_session__ksymbol_event;
+	gs->tool.text_poke	= guest_session__repipe;
+	/*
+	 * Processing a build ID creates a struct dso with that build ID. Later,
+	 * all guest dsos are iterated and the build IDs processed into the host
+	 * session where they will be output to the Build ID feature section
+	 * when the perf.data file header is written.
+	 */
+	gs->tool.build_id	= perf_event__process_build_id;
+	/* Process the id index to know what VCPU an ID belongs to */
+	gs->tool.id_index	= perf_event__process_id_index;
+
+	gs->tool.ordered_events	= true;
+	gs->tool.ordering_requires_timestamps = true;
+
+	gs->data.path	= name;
+	gs->data.force	= force;
+	gs->data.mode	= PERF_DATA_MODE_READ;
+
+	session = perf_session__new(&gs->data, &gs->tool);
+	if (IS_ERR(session))
+		return PTR_ERR(session);
+	gs->session = session;
+
+	/*
+	 * Initial events have zero'd ID samples. Get default ID sample size
+	 * used for removing them.
+	 */
+	gs->dflt_id_hdr_size = session->machines.host.id_hdr_size;
+	/* And default ID for adding back a host-compatible ID sample */
+	gs->dflt_id = evlist__first_id(session->evlist);
+	if (!gs->dflt_id) {
+		pr_err("Guest data has no sample IDs\n");
+		return -EINVAL;
+	}
+
+	/* Temporary file for guest events */
+	gs->tmp_file_name = strdup(tmp_file_name);
+	if (!gs->tmp_file_name)
+		return -ENOMEM;
+	gs->tmp_fd = mkstemp(gs->tmp_file_name);
+	if (gs->tmp_fd < 0)
+		return -errno;
+
+	if (zstd_init(&gs->session->zstd_data, 0) < 0)
+		pr_warning("Guest session decompression initialization failed.\n");
+
+	/*
+	 * perf does not support processing 2 sessions simultaneously, so output
+	 * guest events to a temporary file.
+	 */
+	ret = perf_session__process_events(gs->session);
+	if (ret)
+		return ret;
+	if (lseek(gs->tmp_fd, 0, SEEK_SET))
+		return -errno;
+
+	return 0;
+}
+
+/* Free hlist nodes assuming hlist_node is the first member of hlist entries */
+static void free_hlist(struct hlist_head *heads, size_t hlist_sz)
+{
+	struct hlist_node *pos, *n;
+	size_t i;
+
+	for (i = 0; i < hlist_sz; ++i) {
+		hlist_for_each_safe(pos, n, &heads[i]) {
+			hlist_del(pos);
+			free(pos);
+		}
+	}
+}
+
+static void guest_session__exit(struct guest_session *gs)
+{
+	if (gs->session) {
+		perf_session__delete(gs->session);
+		free_hlist(gs->heads, PERF_EVLIST__HLIST_SIZE);
+		free_hlist(gs->tids, PERF_EVLIST__HLIST_SIZE);
+	}
+	if (gs->tmp_file_name) {
+		if (gs->tmp_fd >= 0)
+			close(gs->tmp_fd);
+		unlink(gs->tmp_file_name);
+		free(gs->tmp_file_name);
+	}
+	free(gs->vcpu);
+	free(gs->perf_data_file);
+}
+
+static void get_tsc_conv(struct perf_tsc_conversion *tc, struct perf_record_time_conv *time_conv)
+{
+	tc->time_shift		= time_conv->time_shift;
+	tc->time_mult		= time_conv->time_mult;
+	tc->time_zero		= time_conv->time_zero;
+	tc->time_cycles		= time_conv->time_cycles;
+	tc->time_mask		= time_conv->time_mask;
+	tc->cap_user_time_zero	= time_conv->cap_user_time_zero;
+	tc->cap_user_time_short	= time_conv->cap_user_time_short;
+}
+
+static void guest_session__get_tc(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+
+	get_tsc_conv(&gs->host_tc, &inject->session->time_conv);
+	get_tsc_conv(&gs->guest_tc, &gs->session->time_conv);
+}
+
+static void guest_session__convert_time(struct guest_session *gs, u64 guest_time, u64 *host_time)
+{
+	u64 tsc;
+
+	if (!guest_time) {
+		*host_time = 0;
+		return;
+	}
+
+	if (gs->guest_tc.cap_user_time_zero)
+		tsc = perf_time_to_tsc(guest_time, &gs->guest_tc);
+	else
+		tsc = guest_time;
+
+	/*
+	 * This is the correct order of operations for x86 if the TSC Offset and
+	 * Multiplier values are used.
+	 */
+	tsc -= gs->time_offset;
+	tsc /= gs->time_scale;
+
+	if (gs->host_tc.cap_user_time_zero)
+		*host_time = tsc_to_perf_time(tsc, &gs->host_tc);
+	else
+		*host_time = tsc;
+}
+
+/* Fetch the next guest event from the temporary file into gs->ev */
+static int guest_session__fetch(struct guest_session *gs)
+{
+	void *buf = gs->ev.event_buf;
+	struct perf_event_header *hdr = buf;
+	ssize_t ret;
+
+	/* Set up front so the EOF marker written below is reachable via gs->ev.event */
+	gs->ev.event = buf;
+	gs->ev.sample.time = 0;
+
+	ret = readn(gs->tmp_fd, buf, sizeof(*hdr));
+	if (ret <= 0) {
+		/* Zero size means EOF (ret == 0); a negative ret is passed up as an error */
+		hdr->size = 0;
+		return ret;
+	}
+
+	/* A size smaller than the header would wrap the length of the next read */
+	if (hdr->size < sizeof(*hdr)) {
+		pr_err("Bad size fetching guest event\n");
+		return -EINVAL;
+	}
+
+	ret = readn(gs->tmp_fd, buf + sizeof(*hdr), hdr->size - sizeof(*hdr));
+	if (ret < 0)
+		return ret;
+
+	if (hdr->type >= PERF_RECORD_USER_TYPE_START) {
+		pr_err("Unexpected type fetching guest event\n");
+		return 0;
+	}
+
+	ret = evlist__parse_sample(gs->session->evlist, gs->ev.event, &gs->ev.sample);
+	if (ret) {
+		pr_err("Parse failed fetching guest event\n");
+		return ret;
+	}
+
+	if (!gs->have_tc) {
+		guest_session__get_tc(gs);
+		gs->have_tc = true;
+	}
+	guest_session__convert_time(gs, gs->ev.sample.time, &gs->ev.sample.time);
+	return 0;
+}
+
+static int evlist__append_id_sample(struct evlist *evlist, union perf_event *ev,
+				    const struct perf_sample *sample)
+{
+	struct evsel *evsel;
+	void *array;
+	int ret;
+
+	evsel = evlist__id2evsel(evlist, sample->id);
+	array = ev;
+
+	if (!evsel) {
+		pr_err("No evsel for id %"PRIu64"\n", sample->id);
+		return -EINVAL;
+	}
+
+	array += ev->header.size;
+	ret = perf_event__synthesize_id_sample(array, evsel->core.attr.sample_type, sample);
+	if (ret < 0)
+		return ret;
+
+	if (ret & 7) {
+		pr_err("Bad id sample size %d\n", ret);
+		return -EINVAL;
+	}
+
+	ev->header.size += ret;
+
+	return 0;
+}
+
+/* Output guest events, converted for the host, that are no later than @timestamp */
+static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	int ret;
+
+	if (!gs->ready)
+		return 0;
+
+	while (1) {
+		struct perf_sample *sample;
+		struct guest_id *guest_id;
+		union perf_event *ev;
+		u16 id_hdr_size;
+		u8 cpumode;
+		u64 id;
+
+		if (!gs->fetched) {
+			ret = guest_session__fetch(gs);
+			if (ret)
+				return ret;
+			gs->fetched = true;
+		}
+
+		ev = gs->ev.event;
+		sample = &gs->ev.sample;
+
+		if (!ev || !ev->header.size)
+			return 0; /* EOF */
+
+		if (sample->time > timestamp)
+			return 0;
+
+		/* Change cpumode to guest */
+		cpumode = ev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+		if (cpumode & PERF_RECORD_MISC_USER)
+			cpumode = PERF_RECORD_MISC_GUEST_USER;
+		else
+			cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
+		ev->header.misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
+		ev->header.misc |= cpumode;
+
+		id = sample->id;
+		if (!id) {
+			id = gs->dflt_id;
+			id_hdr_size = gs->dflt_id_hdr_size;
+		} else {
+			struct evsel *evsel = evlist__id2evsel(gs->session->evlist, id);
+
+			/* Guest data is external input: its id may not belong to any evsel */
+			if (!evsel) {
+				pr_err("No guest evsel for id %"PRIu64"\n", id);
+				return -EINVAL;
+			}
+			id_hdr_size = evsel__id_hdr_size(evsel);
+		}
+
+		if (id_hdr_size & 7) {
+			pr_err("Bad id_hdr_size %u\n", id_hdr_size);
+			return -EINVAL;
+		}
+
+		/* Must also be big enough to hold a header plus the id sample removed below */
+		if ((ev->header.size & 7) || ev->header.size < sizeof(ev->header) + id_hdr_size) {
+			pr_err("Bad event size %u\n", ev->header.size);
+			return -EINVAL;
+		}
+
+		/* Remove guest id sample */
+		ev->header.size -= id_hdr_size;
+
+		guest_id = guest_session__lookup_id(gs, id);
+		if (!guest_id) {
+			pr_err("Guest event with unknown id %"PRIu64"\n", id);
+			return -EINVAL;
+		}
+
+		/* Change to host ID to avoid conflicting ID values */
+		sample->id = guest_id->host_id;
+		sample->stream_id = guest_id->host_id;
+
+		if (sample->cpu != (u32)-1) {
+			if (sample->cpu >= gs->vcpu_cnt) {
+				pr_err("Guest event with unknown VCPU %u\n", sample->cpu);
+				return -EINVAL;
+			}
+			/* Change to host CPU instead of guest VCPU */
+			sample->cpu = gs->vcpu[sample->cpu].cpu;
+		}
+
+		/* New id sample with new ID and CPU */
+		ret = evlist__append_id_sample(inject->session->evlist, ev, sample);
+		if (ret)
+			return ret;
+
+		if (ev->header.size & 7) {
+			pr_err("Bad new event size %u\n", ev->header.size);
+			return -EINVAL;
+		}
+
+		gs->fetched = false;
+
+		ret = output_bytes(inject, ev, ev->header.size);
+		if (ret)
+			return ret;
+	}
+}
+
+static int guest_session__flush_events(struct guest_session *gs)
+{
+	return guest_session__inject_events(gs, -1);
+}
+
+static int host__repipe(struct perf_tool *tool,
+			union perf_event *event,
+			struct perf_sample *sample,
+			struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	int ret;
+
+	ret = guest_session__inject_events(&inject->guest_session, sample->time);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe(tool, event, sample, machine);
+}
+
+static int host__finished_init(struct perf_session *session, union perf_event *event)
+{
+	struct perf_inject *inject = container_of(session->tool, struct perf_inject, tool);
+	struct guest_session *gs = &inject->guest_session;
+	int ret;
+
+	/*
+	 * Peek through host COMM events to find QEMU threads and the VCPU they
+	 * are running.
+	 */
+	ret = host_peek_vm_comms(session, gs);
+	if (ret)
+		return ret;
+
+	if (!gs->vcpu_cnt) {
+		pr_err("No VCPU threads found for pid %u\n", gs->machine_pid);
+		return -EINVAL;
+	}
+
+	/*
+	 * Allocate new (unused) host sample IDs and map them to the guest IDs.
+	 */
+	gs->highest_id = evlist__find_highest_id(session->evlist);
+	ret = guest_session__map_ids(gs, session->evlist);
+	if (ret)
+		return ret;
+
+	ret = guest_session__add_attrs(gs);
+	if (ret)
+		return ret;
+
+	ret = synthesize_id_index(inject, gs->session->evlist->core.nr_entries);
+	if (ret) {
+		pr_err("Failed to synthesize id_index\n");
+		return ret;
+	}
+
+	ret = guest_session__add_build_ids(gs);
+	if (ret) {
+		pr_err("Failed to add guest build IDs\n");
+		return ret;
+	}
+
+	gs->ready = true;
+
+	ret = guest_session__inject_events(gs, 0);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe_op2_synth(session, event);
+}
+
+/*
+ * Obey finished-round ordering. The FINISHED_ROUND event is first processed
+ * which flushes host events to file up until the last flush time. Then inject
+ * guest events up to the same time. Finally write out the FINISHED_ROUND event
+ * itself.
+ */
+static int host__finished_round(struct perf_tool *tool,
+				union perf_event *event,
+				struct ordered_events *oe)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	int ret = perf_event__process_finished_round(tool, event, oe);
+	u64 timestamp = ordered_events__last_flush_time(oe);
+
+	if (ret)
+		return ret;
+
+	ret = guest_session__inject_events(&inject->guest_session, timestamp);
+	if (ret)
+		return ret;
+
+	return perf_event__repipe_oe_synth(tool, event, oe);
+}
+
+static int host__context_switch(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
+	struct guest_session *gs = &inject->guest_session;
+	u32 pid = event->context_switch.next_prev_pid;
+	u32 tid = event->context_switch.next_prev_tid;
+	struct guest_tid *guest_tid;
+	u32 vcpu;
+
+	if (out || pid != gs->machine_pid)
+		goto out;
+
+	guest_tid = guest_session__lookup_tid(gs, tid);
+	if (!guest_tid)
+		goto out;
+
+	if (sample->cpu == (u32)-1) {
+		pr_err("Switch event does not have CPU\n");
+		return -EINVAL;
+	}
+
+	vcpu = guest_tid->vcpu;
+	if (vcpu >= gs->vcpu_cnt)
+		return -EINVAL;
+
+	/* Guest is switching in, record which CPU the VCPU is now running on */
+	gs->vcpu[vcpu].cpu = sample->cpu;
+out:
+	return host__repipe(tool, event, sample, machine);
+}
+
 static void sig_handler(int sig __maybe_unused)
 {
 	session_done = 1;
@@ -767,6 +1666,61 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str,
 	return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM;
 }
 
+/* Parse --guest-data=<guest perf.data file>,<machine pid>[,<time offset>[,<time scale>]] */
+static int parse_guest_data(const struct option *opt, const char *str, int unset)
+{
+	struct perf_inject *inject = opt->value;
+	struct guest_session *gs = &inject->guest_session;
+	char *tok, *s;
+
+	if (unset)
+		return 0;
+	if (!str)
+		goto bad_args;
+
+	s = strdup(str);
+	if (!s)
+		return -ENOMEM;
+
+	free(gs->perf_data_file); /* string kept from an earlier occurrence of the option */
+	gs->perf_data_file = strsep(&s, ",");
+	if (!*gs->perf_data_file)
+		goto bad_args;
+
+	gs->copy_kcore_dir = has_kcore_dir(gs->perf_data_file);
+	if (gs->copy_kcore_dir)
+		inject->output.is_dir = true;
+
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto bad_args;
+	gs->machine_pid = strtoul(tok, NULL, 0);
+	if (!gs->machine_pid)
+		goto bad_args;
+
+	gs->time_scale = 1;
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto out;
+	gs->time_offset = strtoull(tok, NULL, 0);
+
+	tok = strsep(&s, ",");
+	if (!tok)
+		goto out;
+	gs->time_scale = strtod(tok, NULL);
+	/* Also rejects negative values and NaN, which cannot scale a timestamp */
+	if (!(gs->time_scale > 0))
+		goto bad_args;
+out:
+	return 0;
+
+bad_args:
+	pr_err("--guest-data option requires guest perf.data file name, "
+	       "guest machine PID, and optionally guest timestamp offset, "
+	       "and guest timestamp scale factor, separated by commas.\n");
+	return -1;
+}
+
 static int save_section_info_cb(struct perf_file_section *section,
 				struct perf_header *ph __maybe_unused,
 				int feat, int fd __maybe_unused, void *data)
@@ -809,7 +1763,7 @@ static bool keep_feat(int feat)
 	case HEADER_CPU_PMU_CAPS:
 	case HEADER_CLOCK_DATA:
 	case HEADER_HYBRID_TOPOLOGY:
-	case HEADER_HYBRID_CPU_PMU_CAPS:
+	case HEADER_PMU_CAPS:
 		return true;
 	/* Information that can be updated */
 	case HEADER_BUILD_ID:
@@ -896,6 +1850,22 @@ static int copy_kcore_dir(struct perf_inject *inject)
 	return ret;
 }
 
+static int guest_session__copy_kcore_dir(struct guest_session *gs)
+{
+	struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session);
+	char *cmd;
+	int ret;
+
+	ret = asprintf(&cmd, "cp -r -n %s/kcore_dir %s/kcore_dir__%u >/dev/null 2>&1",
+		       gs->perf_data_file, inject->output.path, gs->machine_pid);
+	if (ret < 0)
+		return ret;
+	pr_debug("%s\n", cmd);
+	ret = system(cmd);
+	free(cmd);
+	return ret;
+}
+
 static int output_fd(struct perf_inject *inject)
 {
 	return inject->in_place_update ? -1 : perf_data__fd(&inject->output);
@@ -904,6 +1874,7 @@ static int output_fd(struct perf_inject *inject)
 static int __cmd_inject(struct perf_inject *inject)
 {
 	int ret = -EINVAL;
+	struct guest_session *gs = &inject->guest_session;
 	struct perf_session *session = inject->session;
 	int fd = output_fd(inject);
 	u64 output_data_offset;
@@ -968,6 +1939,47 @@ static int __cmd_inject(struct perf_inject *inject)
 		output_data_offset = roundup(8192 + session->header.data_offset, 4096);
 		if (inject->strip)
 			strip_init(inject);
+	} else if (gs->perf_data_file) {
+		char *name = gs->perf_data_file;
+
+		/*
+		 * Not strictly necessary, but keep these events in order wrt
+		 * guest events.
+		 */
+		inject->tool.mmap		= host__repipe;
+		inject->tool.mmap2		= host__repipe;
+		inject->tool.comm		= host__repipe;
+		inject->tool.fork		= host__repipe;
+		inject->tool.exit		= host__repipe;
+		inject->tool.lost		= host__repipe;
+		inject->tool.context_switch	= host__repipe;
+		inject->tool.ksymbol		= host__repipe;
+		inject->tool.text_poke		= host__repipe;
+		/*
+		 * Once the host session has initialized, set up sample ID
+		 * mapping and feed in guest attrs, build IDs and initial
+		 * events.
+		 */
+		inject->tool.finished_init	= host__finished_init;
+		/* Obey finished round ordering */
+		inject->tool.finished_round	= host__finished_round,
+		/* Keep track of which CPU a VCPU is running on */
+		inject->tool.context_switch	= host__context_switch;
+		/*
+		 * Must order events to be able to obey finished round
+		 * ordering.
+		 */
+		inject->tool.ordered_events	= true;
+		inject->tool.ordering_requires_timestamps = true;
+		/* Set up a separate session to process guest perf.data file */
+		ret = guest_session__start(gs, name, session->data->force);
+		if (ret) {
+			pr_err("Failed to process %s, error %d\n", name, ret);
+			return ret;
+		}
+		/* Allow space in the header for guest attributes */
+		output_data_offset += gs->session->header.data_offset;
+		output_data_offset = roundup(output_data_offset, 4096);
 	}
 
 	if (!inject->itrace_synth_opts.set)
@@ -980,6 +1992,18 @@ static int __cmd_inject(struct perf_inject *inject)
 	if (ret)
 		return ret;
 
+	if (gs->session) {
+		/*
+		 * Remaining guest events have later timestamps. Flush them
+		 * out to file.
+		 */
+		ret = guest_session__flush_events(gs);
+		if (ret) {
+			pr_err("Failed to flush guest events\n");
+			return ret;
+		}
+	}
+
 	if (!inject->is_pipe && !inject->in_place_update) {
 		struct inject_fc inj_fc = {
 			.fc.copy = feat_copy_cb,
@@ -1014,8 +2038,17 @@ static int __cmd_inject(struct perf_inject *inject)
 
 		if (inject->copy_kcore_dir) {
 			ret = copy_kcore_dir(inject);
-			if (ret)
+			if (ret) {
+				pr_err("Failed to copy kcore\n");
 				return ret;
+			}
+		}
+		if (gs->copy_kcore_dir) {
+			ret = guest_session__copy_kcore_dir(gs);
+			if (ret) {
+				pr_err("Failed to copy guest kcore\n");
+				return ret;
+			}
 		}
 	}
 
@@ -1061,6 +2094,7 @@ int cmd_inject(int argc, const char **argv)
 			.stat		= perf_event__repipe_op2_synth,
 			.stat_round	= perf_event__repipe_op2_synth,
 			.feature	= perf_event__repipe_op2_synth,
+			.finished_init	= perf_event__repipe_op2_synth,
 			.compressed	= perf_event__repipe_op4_synth,
 			.auxtrace	= perf_event__repipe_auxtrace,
 		},
@@ -1112,6 +2146,12 @@ int cmd_inject(int argc, const char **argv)
 		OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts",
 				    "correlate time between VM guests and the host",
 				    parse_vm_time_correlation),
+		OPT_CALLBACK_OPTARG(0, "guest-data", &inject, NULL, "opts",
+				    "inject events from a guest perf.data file",
+				    parse_guest_data),
+		OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
+			   "guest mount directory under which every guest os"
+			   " instance has a subdir"),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
@@ -1242,6 +2282,8 @@ int cmd_inject(int argc, const char **argv)
 
 	ret = __cmd_inject(&inject);
 
+	guest_session__exit(&inject.guest_session);
+
 out_delete:
 	zstd_fini(&(inject.session->zstd_data));
 	perf_session__delete(inject.session);