38 files changed, 1734 insertions, 166 deletions
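
The headline change in this series is VM Time Correlation for Intel PT traces of virtual machines, along with the new 'Z' ("timeless" decoding) itrace option and the hybrid topology perf.data headers. For orientation, a minimal sketch of the new workflow; the PID and the TSC Offset value are illustrative, taken from the documentation examples further down:

 $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/cyc=1/k -p 16998
 $ perf inject -i perf.data.kvm --vm-time-correlation=dry-run                     # report TSC Offset per VMCS, no file update
 $ perf inject -i perf.data.kvm --vm-time-correlation=0xffffe42722c64c41 --force  # update timestamps in place
 $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --itrace=e-o            # check for decoder errors
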
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index 0f1005209a2b..2d586fe5e4c5 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -20,6 +20,7 @@
 		L	synthesize last branch entries on existing event records
 		s	skip initial number of events
 		q	quicker (less detailed) decoding
+		Z	prefer to ignore timestamps (so-called "timeless" decoding)
 
 	The default is all events i.e. the same as --itrace=ibxwpe,
 	except for perf script where it is --itrace=ce
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index a8eccff21281..91108fe3ad5f 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -68,6 +68,16 @@ include::itrace.txt[]
 --force::
 	Don't complain, do it.
 
+--vm-time-correlation[=OPTIONS]::
+	Some architectures may capture AUX area data which contains timestamps
+	affected by virtualization. This option will update those timestamps
+	in place, to correlate with host timestamps. The in-place update means
+	that an output file is not specified, and instead the input file is
+	modified. The options are architecture specific, except that they may
+	start with "dry-run" which will cause the file to be processed but
+	without updating it. Currently this option is supported only by
+	Intel PT; refer to linkperf:perf-intel-pt[1].
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1],
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index bcf3eca5afbe..e382dbd4ff0a 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -869,6 +869,7 @@ The letters are:
 		L	synthesize last branch entries on existing event records
 		s	skip initial number of events
 		q	quicker (less detailed) decoding
+		Z	prefer to ignore timestamps (so-called "timeless" decoding)
 
 "Instructions" events look like they were recorded by "perf record -e instructions".
 
@@ -1062,6 +1063,10 @@ What *will* be decoded with the qq option:
 
 	- instruction pointer associated with PSB packets
 
+The Z option is equivalent to having recorded a trace without TSC
+(i.e. config term tsc=0). It can be useful to avoid timestamp issues when
+decoding a trace of a virtual machine.
+
 dump option
 ~~~~~~~~~~~
 
@@ -1150,8 +1155,9 @@ include::build-xed.txt[]
 Tracing Virtual Machines
 ------------------------
 
-Currently, only kernel tracing is supported and only with "timeless" decoding
-i.e. no TSC timestamps
+Currently, only kernel tracing is supported and only with either "timeless" decoding
+(i.e. no TSC timestamps) or VM Time Correlation. VM Time Correlation is an extra step
+using 'perf inject' and requires unchanging VMX TSC Offset and no VMX TSC Scaling.
 
 Other limitations and caveats
 
@@ -1162,7 +1168,7 @@ Other limitations and caveats
  Guest VCPU is unknown but may be able to be inferred from the host thread
  Callchains are not supported
 
-Example
+Example using "timeless" decoding
 
 Start VM
 
@@ -1226,6 +1232,107 @@ perf script can be used to provide an instruction trace
 	:1440  1440 ffffffffbb74603c clockevents_program_event+0x4c ([guest.kernel.kallsyms])	popq %rbx
 	:1440  1440 ffffffffbb74603d clockevents_program_event+0x4d ([guest.kernel.kallsyms])	popq %r12
 
+Example using VM Time Correlation
+
+Start VM
+
+ $ sudo virsh start kubuntu20.04
+ Domain kubuntu20.04 started
+
+Mount the guest file system. Note sshfs needs -o direct_io to enable reading of proc files.
+root access is needed to read /proc/kcore.
+
+ $ mkdir -p vm0
+ $ sshfs -o direct_io root@vm0:/ vm0
+
+Copy the guest /proc/kallsyms, /proc/modules and /proc/kcore
+
+ $ perf buildid-cache -v --kcore vm0/proc/kcore
+ same kcore found in /home/user/.debug/[kernel.kcore]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777
+ $ KALLSYMS=/home/user/.debug/\[kernel.kcore\]/cc9c55a98c5e4ec0aeda69302554aabed5cd6491/2021021312450777/kallsyms
+
+Find the VM process
+
+ $ ps -eLl | grep 'KVM\|PID'
+ F S   UID     PID    PPID     LWP  C PRI  NI ADDR SZ WCHAN  TTY          TIME CMD
+ 3 S 64055   16998       1   17005 13  80   0 - 1818189 -    ?        00:00:16 CPU 0/KVM
+ 3 S 64055   16998       1   17006  4  80   0 - 1818189 -    ?        00:00:05 CPU 1/KVM
+ 3 S 64055   16998       1   17007  3  80   0 - 1818189 -    ?        00:00:04 CPU 2/KVM
+ 3 S 64055   16998       1   17008  4  80   0 - 1818189 -    ?        00:00:05 CPU 3/KVM
+
+Start an open-ended perf record, tracing the VM process, do something on the VM, and then ctrl-C to stop.
+IPC can be determined, hence cyc=1 can be added.
+Only kernel decoding is supported, so 'k' must be specified.
+Intel PT traces both the host and the guest so --guest and --host need to be specified.
+
+ $ sudo perf kvm --guest --host --guestkallsyms $KALLSYMS record --kcore -e intel_pt/cyc=1/k -p 16998
+ ^C[ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 9.041 MB perf.data.kvm ]
+
+Now 'perf inject' can be used to determine the VMX TSC Offset. Note, Intel PT TSC packets are
+only 7 bytes, so the TSC Offset might differ from the actual value in the 8th byte. That will
+have no effect i.e. the resulting timestamps will be correct anyway.
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation=dry-run
+ ERROR: Unknown TSC Offset for VMCS 0x1bff6a
+ VMCS: 0x1bff6a  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1cbc08
+ VMCS: 0x1cbc08  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1c3ce8
+ VMCS: 0x1c3ce8  TSC Offset 0xffffe42722c64c41
+ ERROR: Unknown TSC Offset for VMCS 0x1cbce9
+ VMCS: 0x1cbce9  TSC Offset 0xffffe42722c64c41
+
+Each virtual CPU has a different Virtual Machine Control Structure (VMCS)
+shown above with the calculated TSC Offset. For an unchanging TSC Offset,
+they should all be the same for the same virtual machine.
+
+Now that the TSC Offset is known, it can be provided to 'perf inject'
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation="dry-run 0xffffe42722c64c41"
+
+Note the options for 'perf inject' --vm-time-correlation are:
+
+ [ dry-run ] [ <TSC Offset> [ : <VMCS> [ , <VMCS> ]... ] ]...
+
+So it is possible to specify different TSC Offsets for different VMCSs (an
+illustrative example follows the error check below).
+The option "dry-run" will cause the file to be processed but without updating it.
+Note it is also possible to get an intel_pt.log file by adding the option --itrace=d
+
+There were no errors, so do it for real
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation=0xffffe42722c64c41 --force
+
+'perf script' can be used to see if there are any decoder errors
+
+ $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --itrace=e-o
+
+There were none.
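+
+Had the dry run reported different TSC Offsets for different VMCSs, the option
+grammar above allows one offset per group of VMCSs. A sketch, with offset and
+VMCS values that are purely illustrative:
+
+ $ perf inject -i perf.data.kvm --vm-time-correlation="0xffffe42722c64c41:0x1bff6a,0x1cbc08 0xffffe42722c64c42:0x1c3ce8,0x1cbce9" --force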
+
+'perf script' can be used to provide an instruction trace showing timestamps
+
+ $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --insn-trace --xed -F+ipc | grep -C10 vmresume | head -21
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms]) movq 0x48(%rax), %r9
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms]) movq 0x50(%rax), %r10
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms]) movq 0x58(%rax), %r11
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ce9 __vmx_vcpu_run+0x49 ([kernel.kallsyms]) movq 0x60(%rax), %r12
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133ced __vmx_vcpu_run+0x4d ([kernel.kallsyms]) movq 0x68(%rax), %r13
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf1 __vmx_vcpu_run+0x51 ([kernel.kallsyms]) movq 0x70(%rax), %r14
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf5 __vmx_vcpu_run+0x55 ([kernel.kallsyms]) movq 0x78(%rax), %r15
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cf9 __vmx_vcpu_run+0x59 ([kernel.kallsyms]) movq (%rax), %rax
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133cfc __vmx_vcpu_run+0x5c ([kernel.kallsyms]) callq 0xffffffff82133c40
+ CPU 1/KVM 17006 [001] 11500.262865593: ffffffff82133c40 vmx_vmenter+0x0 ([kernel.kallsyms]) jz 0xffffffff82133c46
+ CPU 1/KVM 17006 [001] 11500.262866075: ffffffff82133c42 vmx_vmenter+0x2 ([kernel.kallsyms]) vmresume IPC: 0.05 (40/769)
+ :17006 17006 [001] 11500.262869216: ffffffff82200cb0 asm_sysvec_apic_timer_interrupt+0x0 ([guest.kernel.kallsyms]) clac
+ :17006 17006 [001] 11500.262869216: ffffffff82200cb3 asm_sysvec_apic_timer_interrupt+0x3 ([guest.kernel.kallsyms]) pushq $0xffffffffffffffff
+ :17006 17006 [001] 11500.262869216: ffffffff82200cb5 asm_sysvec_apic_timer_interrupt+0x5 ([guest.kernel.kallsyms]) callq 0xffffffff82201160
+ :17006 17006 [001] 11500.262869216: ffffffff82201160 error_entry+0x0 ([guest.kernel.kallsyms]) cld
+ :17006 17006 [001] 11500.262869216: ffffffff82201161 error_entry+0x1 ([guest.kernel.kallsyms]) pushq %rsi
+ :17006 17006 [001] 11500.262869216: ffffffff82201162 error_entry+0x2 ([guest.kernel.kallsyms]) movq 0x8(%rsp), %rsi
+ :17006 17006 [001] 11500.262869216: ffffffff82201167 error_entry+0x7 ([guest.kernel.kallsyms]) movq %rdi, 0x8(%rsp)
+ :17006 17006 [001] 11500.262869216: ffffffff8220116c error_entry+0xc ([guest.kernel.kallsyms]) pushq %rdx
+ :17006 17006 [001] 11500.262869216: ffffffff8220116d error_entry+0xd ([guest.kernel.kallsyms]) pushq %rcx
+ :17006 17006 [001] 11500.262869216: ffffffff8220116e error_entry+0xe ([guest.kernel.kallsyms]) pushq %rax
+
 SEE ALSO
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index 9ee96640744e..e6ff8c898ada 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -402,6 +402,39 @@ struct {
 	u64 clockid_time_ns;
 };
 
+	HEADER_HYBRID_TOPOLOGY = 30,
+
+Indicates the hybrid CPUs. The format of the data is as below.
+
+struct {
+	u32 nr;
+	struct {
+		char pmu_name[];
+		char cpus[];
+	} [nr]; /* Variable length records */
+};
+
+Example:
+ hybrid cpu system:
+ cpu_core cpu list : 0-15
+ cpu_atom cpu list : 16-23
+
+	HEADER_HYBRID_CPU_PMU_CAPS = 31,
+
+	A list of hybrid CPU PMU capabilities.
+ +struct { + u32 nr_pmu; + struct { + u32 nr_cpu_pmu_caps; + { + char name[]; + char value[]; + } [nr_cpu_pmu_caps]; + char pmu_name[]; + } [nr_pmu]; +}; + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 406a9519145e..829e0498bd51 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -635,7 +635,7 @@ endif ifdef BUILD_BPF_SKEL $(call feature_check,clang-bpf-co-re) ifeq ($(feature-clang-bpf-co-re), 0) - dummy := $(error Error: clang too old. Please install recent clang) + dummy := $(error Error: clang too old/not installed. Please install recent clang to build with BUILD_BPF_SKEL) endif $(call detected,CONFIG_PERF_BPF_SKEL) CFLAGS += -DHAVE_BPF_SKEL diff --git a/tools/perf/arch/arm/include/arch-tests.h b/tools/perf/arch/arm/include/arch-tests.h index 90ec4c8cb880..c62538052404 100644 --- a/tools/perf/arch/arm/include/arch-tests.h +++ b/tools/perf/arch/arm/include/arch-tests.h @@ -2,11 +2,6 @@ #ifndef ARCH_TESTS_H #define ARCH_TESTS_H -#ifdef HAVE_DWARF_UNWIND_SUPPORT -struct thread; -struct perf_sample; -#endif - extern struct test arch_tests[]; #endif diff --git a/tools/perf/arch/arm64/include/arch-tests.h b/tools/perf/arch/arm64/include/arch-tests.h index 90ec4c8cb880..c62538052404 100644 --- a/tools/perf/arch/arm64/include/arch-tests.h +++ b/tools/perf/arch/arm64/include/arch-tests.h @@ -2,11 +2,6 @@ #ifndef ARCH_TESTS_H #define ARCH_TESTS_H -#ifdef HAVE_DWARF_UNWIND_SUPPORT -struct thread; -struct perf_sample; -#endif - extern struct test arch_tests[]; #endif diff --git a/tools/perf/arch/powerpc/include/arch-tests.h b/tools/perf/arch/powerpc/include/arch-tests.h index 1c7be75cbc78..c62538052404 100644 --- a/tools/perf/arch/powerpc/include/arch-tests.h +++ b/tools/perf/arch/powerpc/include/arch-tests.h @@ -2,13 +2,6 @@ #ifndef ARCH_TESTS_H #define ARCH_TESTS_H -#ifdef HAVE_DWARF_UNWIND_SUPPORT -struct thread; -struct perf_sample; -int test__arch_unwind_sample(struct perf_sample *sample, - struct thread *thread); -#endif - extern struct test arch_tests[]; #endif diff --git a/tools/perf/arch/powerpc/tests/dwarf-unwind.c b/tools/perf/arch/powerpc/tests/dwarf-unwind.c index 8efd9ed9e9db..c9cb4b059392 100644 --- a/tools/perf/arch/powerpc/tests/dwarf-unwind.c +++ b/tools/perf/arch/powerpc/tests/dwarf-unwind.c @@ -7,7 +7,6 @@ #include "event.h" #include "debug.h" #include "tests/tests.h" -#include "arch-tests.h" #define STACK_SIZE 8192 diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 0e20f3dc69f3..9599e7a3f1af 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -2,23 +2,15 @@ #ifndef ARCH_TESTS_H #define ARCH_TESTS_H -#include <linux/compiler.h> struct test; /* Tests */ -int test__rdpmc(struct test *test __maybe_unused, int subtest); -int test__insn_x86(struct test *test __maybe_unused, int subtest); +int test__rdpmc(struct test *test, int subtest); +int test__insn_x86(struct test *test, int subtest); int test__intel_pt_pkt_decoder(struct test *test, int subtest); int test__bp_modify(struct test *test, int subtest); int test__x86_sample_parsing(struct test *test, int subtest); -#ifdef HAVE_DWARF_UNWIND_SUPPORT -struct thread; -struct perf_sample; -int test__arch_unwind_sample(struct perf_sample *sample, - struct thread *thread); -#endif - extern struct test arch_tests[]; #endif diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c 
b/tools/perf/arch/x86/tests/dwarf-unwind.c index 478078fb0f22..a54dea7c112f 100644 --- a/tools/perf/arch/x86/tests/dwarf-unwind.c +++ b/tools/perf/arch/x86/tests/dwarf-unwind.c @@ -7,7 +7,6 @@ #include "event.h" #include "debug.h" #include "tests/tests.h" -#include "arch-tests.h" #define STACK_SIZE 8192 diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index 072920475b65..c5dd54f6ef5e 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -133,11 +133,56 @@ static struct kvm_events_ops ioport_events = { .name = "IO Port Access" }; + /* The time of emulation msr is from kvm_msr to kvm_entry. */ +static void msr_event_get_key(struct evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + key->key = evsel__intval(evsel, sample, "ecx"); + key->info = evsel__intval(evsel, sample, "write"); +} + +static bool msr_event_begin(struct evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + if (!strcmp(evsel->name, "kvm:kvm_msr")) { + msr_event_get_key(evsel, sample, key); + return true; + } + + return false; +} + +static bool msr_event_end(struct evsel *evsel, + struct perf_sample *sample __maybe_unused, + struct event_key *key __maybe_unused) +{ + return kvm_entry_event(evsel); +} + +static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, + struct event_key *key, + char *decode) +{ + scnprintf(decode, decode_str_len, "%#llx:%s", + (unsigned long long)key->key, + key->info ? "W" : "R"); +} + +static struct kvm_events_ops msr_events = { + .is_begin_event = msr_event_begin, + .is_end_event = msr_event_end, + .decode_key = msr_event_decode_key, + .name = "MSR Access" +}; + const char *kvm_events_tp[] = { "kvm:kvm_entry", "kvm:kvm_exit", "kvm:kvm_mmio", "kvm:kvm_pio", + "kvm:kvm_msr", NULL, }; @@ -145,6 +190,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = { { .name = "vmexit", .ops = &exit_events }, { .name = "mmio", .ops = &mmio_events }, { .name = "ioport", .ops = &ioport_events }, + { .name = "msr", .ops = &msr_events }, { NULL, NULL }, }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ddccc0eb7390..102cafb0c0b3 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -31,6 +31,7 @@ #include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */ #include <linux/list.h> +#include <linux/string.h> #include <errno.h> #include <signal.h> @@ -43,6 +44,8 @@ struct perf_inject { bool have_auxtrace; bool strip; bool jit_mode; + bool in_place_update; + bool in_place_update_dry_run; const char *input_name; struct perf_data output; u64 bytes_written; @@ -696,12 +699,42 @@ static void strip_init(struct perf_inject *inject) evsel->handler = drop_sample; } +static int parse_vm_time_correlation(const struct option *opt, const char *str, int unset) +{ + struct perf_inject *inject = opt->value; + const char *args; + char *dry_run; + + if (unset) + return 0; + + inject->itrace_synth_opts.set = true; + inject->itrace_synth_opts.vm_time_correlation = true; + inject->in_place_update = true; + + if (!str) + return 0; + + dry_run = skip_spaces(str); + if (!strncmp(dry_run, "dry-run", strlen("dry-run"))) { + inject->itrace_synth_opts.vm_tm_corr_dry_run = true; + inject->in_place_update_dry_run = true; + args = dry_run + strlen("dry-run"); + } else { + args = str; + } + + inject->itrace_synth_opts.vm_tm_corr_args = strdup(args); + + return inject->itrace_synth_opts.vm_tm_corr_args ? 
0 : -ENOMEM; +} + static int __cmd_inject(struct perf_inject *inject) { int ret = -EINVAL; struct perf_session *session = inject->session; struct perf_data *data_out = &inject->output; - int fd = perf_data__fd(data_out); + int fd = inject->in_place_update ? -1 : perf_data__fd(data_out); u64 output_data_offset; signal(SIGINT, sig_handler); @@ -737,6 +770,15 @@ static int __cmd_inject(struct perf_inject *inject) else if (!strncmp(name, "sched:sched_stat_", 17)) evsel->handler = perf_inject__sched_stat; } + } else if (inject->itrace_synth_opts.vm_time_correlation) { + session->itrace_synth_opts = &inject->itrace_synth_opts; + memset(&inject->tool, 0, sizeof(inject->tool)); + inject->tool.id_index = perf_event__process_id_index; + inject->tool.auxtrace_info = perf_event__process_auxtrace_info; + inject->tool.auxtrace = perf_event__process_auxtrace; + inject->tool.auxtrace_error = perf_event__process_auxtrace_error; + inject->tool.ordered_events = true; + inject->tool.ordering_requires_timestamps = true; } else if (inject->itrace_synth_opts.set) { session->itrace_synth_opts = &inject->itrace_synth_opts; inject->itrace_synth_opts.inject = true; @@ -759,14 +801,14 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!data_out->is_pipe) + if (!data_out->is_pipe && !inject->in_place_update) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); if (ret) return ret; - if (!data_out->is_pipe) { + if (!data_out->is_pipe && !inject->in_place_update) { if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); @@ -878,6 +920,9 @@ int cmd_inject(int argc, const char **argv) itrace_parse_synth_opts), OPT_BOOLEAN(0, "strip", &inject.strip, "strip non-synthesized events (use with --itrace)"), + OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts", + "correlate time between VM guests and the host", + parse_vm_time_correlation), OPT_END() }; const char * const inject_usage[] = { @@ -900,7 +945,23 @@ int cmd_inject(int argc, const char **argv) return -1; } - if (perf_data__open(&inject.output)) { + if (inject.in_place_update) { + if (!strcmp(inject.input_name, "-")) { + pr_err("Input file name required for in-place updating\n"); + return -1; + } + if (strcmp(inject.output.path, "-")) { + pr_err("Output file name must not be specified for in-place updating\n"); + return -1; + } + if (!data.force && !inject.in_place_update_dry_run) { + pr_err("The input file would be updated in place, " + "the --force option is required.\n"); + return -1; + } + if (!inject.in_place_update_dry_run) + data.in_place_update = true; + } else if (perf_data__open(&inject.output)) { perror("failed to create output file"); return -1; } @@ -950,5 +1011,6 @@ int cmd_inject(int argc, const char **argv) out_delete: zstd_fini(&(inject.session->zstd_data)); perf_session__delete(inject.session); + free(inject.itrace_synth_opts.vm_tm_corr_args); return ret; } diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3337b5f93336..bc3dd379eb67 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -969,6 +969,15 @@ out: return rc; } +static void set_timestamp_boundary(struct record *rec, u64 sample_time) +{ + if (rec->evlist->first_sample_time == 0) + rec->evlist->first_sample_time = sample_time; + + if (sample_time) + rec->evlist->last_sample_time = sample_time; +} + static int process_sample_event(struct perf_tool *tool, union perf_event *event, 
struct perf_sample *sample, @@ -977,10 +986,7 @@ static int process_sample_event(struct perf_tool *tool, { struct record *rec = container_of(tool, struct record, tool); - if (rec->evlist->first_sample_time == 0) - rec->evlist->first_sample_time = sample->time; - - rec->evlist->last_sample_time = sample->time; + set_timestamp_boundary(rec, sample->time); if (rec->buildid_all) return 0; @@ -2402,6 +2408,17 @@ static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *eve return perf_event__process_mmap2(tool, event, sample, machine); } +static int process_timestamp_boundary(struct perf_tool *tool, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct machine *machine __maybe_unused) +{ + struct record *rec = container_of(tool, struct record, tool); + + set_timestamp_boundary(rec, sample->time); + return 0; +} + /* * XXX Ideally would be local to cmd_record() and passed to a record__new * because we need to have access to it in record__exit, that is called @@ -2436,6 +2453,8 @@ static struct record record = { .namespaces = perf_event__process_namespaces, .mmap = build_id__process_mmap, .mmap2 = build_id__process_mmap2, + .itrace_start = process_timestamp_boundary, + .aux = process_timestamp_boundary, .ordered_events = true, }, }; diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 83638097c3bc..a288035eb362 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -17,10 +17,6 @@ #include "callchain.h" #include "util/synthetic-events.h" -#if defined (__x86_64__) || defined (__i386__) || defined (__powerpc__) -#include "arch-tests.h" -#endif - /* For bsearch. We try to unwind functions in shared object. */ #include <stdlib.h> diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 94bd5d215d94..da013e90a945 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -84,9 +84,11 @@ make_no_libaudit := NO_LIBAUDIT=1 make_no_libbionic := NO_LIBBIONIC=1 make_no_auxtrace := NO_AUXTRACE=1 make_no_libbpf := NO_LIBBPF=1 +make_libbpf_dynamic := LIBBPF_DYNAMIC=1 make_no_libbpf_DEBUG := NO_LIBBPF=1 DEBUG=1 make_no_libcrypto := NO_LIBCRYPTO=1 make_with_babeltrace:= LIBBABELTRACE=1 +make_with_coresight := CORESIGHT=1 make_no_sdt := NO_SDT=1 make_no_syscall_tbl := NO_SYSCALL_TABLE=1 make_with_clangllvm := LIBCLANGLLVM=1 @@ -148,11 +150,13 @@ run += make_no_libaudit run += make_no_libbionic run += make_no_auxtrace run += make_no_libbpf +run += make_libbpf_dynamic run += make_no_libbpf_DEBUG run += make_no_libcrypto run += make_no_sdt run += make_no_syscall_tbl run += make_with_babeltrace +run += make_with_coresight run += make_with_clangllvm run += make_with_libpfm4 run += make_help @@ -266,6 +270,9 @@ test_make_install_info_O := $(test_ok) test_make_install_pdf := $(test_ok) test_make_install_pdf_O := $(test_ok) +test_make_libbpf_dynamic := ldd $(PERF_O)/perf | grep -q libbpf +test_make_libbpf_dynamic_O := ldd $$TMP_O/perf | grep -q libbpf + test_make_python_perf_so_O := test -f $$TMP_O/python/perf.so test_make_perf_o_O := test -f $$TMP_O/perf.o test_make_util_map_o_O := test -f $$TMP_O/util/map.o diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index b85f005308a3..1100dd55b657 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -133,14 +133,12 @@ bool test__bp_account_is_supported(void); bool test__wp_is_supported(void); bool test__tsc_is_supported(void); -#if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT struct thread; 
struct perf_sample; int test__arch_unwind_sample(struct perf_sample *sample, struct thread *thread); #endif -#endif #if defined(__arm__) int test__vectors_page(struct test *test, int subtest); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 1b4091a3b508..62268f8b6c4f 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1120,8 +1120,9 @@ int auxtrace_queue_data(struct perf_session *session, bool samples, bool events) auxtrace_queue_data_cb, &qd); } -void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd) +void *auxtrace_buffer__get_data_rw(struct auxtrace_buffer *buffer, int fd, bool rw) { + int prot = rw ? PROT_READ | PROT_WRITE : PROT_READ; size_t adj = buffer->data_offset & (page_size - 1); size_t size = buffer->size + adj; off_t file_offset = buffer->data_offset - adj; @@ -1130,7 +1131,7 @@ void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd) if (buffer->data) return buffer->data; - addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset); + addr = mmap(NULL, size, prot, MAP_SHARED, fd, file_offset); if (addr == MAP_FAILED) return NULL; @@ -1569,6 +1570,9 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, case 'q': synth_opts->quick += 1; break; + case 'Z': + synth_opts->timeless_decoding = true; + break; case ' ': case ',': break; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index a4fbb33b7245..472c0973b1f1 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -89,6 +89,10 @@ enum itrace_period_type { * @tlb: whether to synthesize TLB events * @remote_access: whether to synthesize remote access events * @mem: whether to synthesize memory events + * @timeless_decoding: prefer "timeless" decoding i.e. 
ignore timestamps + * @vm_time_correlation: perform VM Time Correlation + * @vm_tm_corr_dry_run: VM Time Correlation dry-run + * @vm_tm_corr_args: VM Time Correlation implementation-specific arguments * @callchain_sz: maximum callchain size * @last_branch_sz: branch context size * @period: 'instructions' events period @@ -128,6 +132,10 @@ struct itrace_synth_opts { bool tlb; bool remote_access; bool mem; + bool timeless_decoding; + bool vm_time_correlation; + bool vm_tm_corr_dry_run; + char *vm_tm_corr_args; unsigned int callchain_sz; unsigned int last_branch_sz; unsigned long long period; @@ -525,7 +533,11 @@ int auxtrace_queue_data(struct perf_session *session, bool samples, bool events); struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue, struct auxtrace_buffer *buffer); -void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd); +void *auxtrace_buffer__get_data_rw(struct auxtrace_buffer *buffer, int fd, bool rw); +static inline void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd) +{ + return auxtrace_buffer__get_data_rw(buffer, fd, false); +} void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer); void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer); void auxtrace_buffer__free(struct auxtrace_buffer *buffer); diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 1b52402a8923..ec77e2a7b3ca 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -12,6 +12,7 @@ #include "cpumap.h" #include "debug.h" #include "env.h" +#include "pmu-hybrid.h" #define CORE_SIB_FMT \ "%s/devices/system/cpu/cpu%d/topology/core_siblings_list" @@ -351,3 +352,82 @@ void numa_topology__delete(struct numa_topology *tp) free(tp); } + +static int load_hybrid_node(struct hybrid_topology_node *node, + struct perf_pmu *pmu) +{ + const char *sysfs; + char path[PATH_MAX]; + char *buf = NULL, *p; + FILE *fp; + size_t len = 0; + + node->pmu_name = strdup(pmu->name); + if (!node->pmu_name) + return -1; + + sysfs = sysfs__mountpoint(); + if (!sysfs) + goto err; + + snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, pmu->name); + fp = fopen(path, "r"); + if (!fp) + goto err; + + if (getline(&buf, &len, fp) <= 0) { + fclose(fp); + goto err; + } + + p = strchr(buf, '\n'); + if (p) + *p = '\0'; + + fclose(fp); + node->cpus = buf; + return 0; + +err: + zfree(&node->pmu_name); + free(buf); + return -1; +} + +struct hybrid_topology *hybrid_topology__new(void) +{ + struct perf_pmu *pmu; + struct hybrid_topology *tp = NULL; + u32 nr, i = 0; + + nr = perf_pmu__hybrid_pmu_num(); + if (nr == 0) + return NULL; + + tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0]) * nr); + if (!tp) + return NULL; + + tp->nr = nr; + perf_pmu__for_each_hybrid_pmu(pmu) { + if (load_hybrid_node(&tp->nodes[i], pmu)) { + hybrid_topology__delete(tp); + return NULL; + } + i++; + } + + return tp; +} + +void hybrid_topology__delete(struct hybrid_topology *tp) +{ + u32 i; + + for (i = 0; i < tp->nr; i++) { + zfree(&tp->nodes[i].pmu_name); + zfree(&tp->nodes[i].cpus); + } + + free(tp); +} diff --git a/tools/perf/util/cputopo.h b/tools/perf/util/cputopo.h index 6201c3790d86..d9af97177068 100644 --- a/tools/perf/util/cputopo.h +++ b/tools/perf/util/cputopo.h @@ -25,10 +25,23 @@ struct numa_topology { struct numa_topology_node nodes[]; }; +struct hybrid_topology_node { + char *pmu_name; + char *cpus; +}; + +struct hybrid_topology { + u32 nr; + struct hybrid_topology_node nodes[]; +}; + struct cpu_topology *cpu_topology__new(void); void cpu_topology__delete(struct 
cpu_topology *tp); struct numa_topology *numa_topology__new(void); void numa_topology__delete(struct numa_topology *tp); +struct hybrid_topology *hybrid_topology__new(void); +void hybrid_topology__delete(struct hybrid_topology *tp); + #endif /* __PERF_CPUTOPO_H */ diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 059bcec3f651..3e1a05bc82cc 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -6,6 +6,7 @@ * Author: Mathieu Poirier <mathieu.poirier@linaro.org> */ +#include <asm/bug.h> #include <linux/coresight-pmu.h> #include <linux/err.h> #include <linux/list.h> @@ -17,6 +18,7 @@ #include "cs-etm.h" #include "cs-etm-decoder.h" +#include "debug.h" #include "intlist.h" /* use raw logging */ @@ -276,13 +278,13 @@ cs_etm_decoder__do_soft_timestamp(struct cs_etm_queue *etmq, const uint8_t trace_chan_id) { /* No timestamp packet has been received, nothing to do */ - if (!packet_queue->timestamp) + if (!packet_queue->cs_timestamp) return OCSD_RESP_CONT; - packet_queue->timestamp = packet_queue->next_timestamp; + packet_queue->cs_timestamp = packet_queue->next_cs_timestamp; /* Estimate the timestamp for the next range packet */ - packet_queue->next_timestamp += packet_queue->instr_count; + packet_queue->next_cs_timestamp += packet_queue->instr_count; packet_queue->instr_count = 0; /* Tell the front end which traceid_queue needs attention */ @@ -294,7 +296,8 @@ cs_etm_decoder__do_soft_timestamp(struct cs_etm_queue *etmq, static ocsd_datapath_resp_t cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq, const ocsd_generic_trace_elem *elem, - const uint8_t trace_chan_id) + const uint8_t trace_chan_id, + const ocsd_trc_index_t indx) { struct cs_etm_packet_queue *packet_queue; @@ -308,20 +311,39 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq, * Function do_soft_timestamp() will report the value to the front end, * hence asking the decoder to keep decoding rather than stopping. */ - if (packet_queue->timestamp) { - packet_queue->next_timestamp = elem->timestamp; + if (packet_queue->cs_timestamp) { + packet_queue->next_cs_timestamp = elem->timestamp; return OCSD_RESP_CONT; } - /* - * This is the first timestamp we've seen since the beginning of traces - * or a discontinuity. Since timestamps packets are generated *after* - * range packets have been generated, we need to estimate the time at - * which instructions started by subtracting the number of instructions - * executed to the timestamp. - */ - packet_queue->timestamp = elem->timestamp - packet_queue->instr_count; - packet_queue->next_timestamp = elem->timestamp; + + if (!elem->timestamp) { + /* + * Zero timestamps can be seen due to misconfiguration or hardware bugs. + * Warn once, and don't try to subtract instr_count as it would result in an + * underflow. + */ + packet_queue->cs_timestamp = 0; + WARN_ONCE(true, "Zero Coresight timestamp found at Idx:%" OCSD_TRC_IDX_STR + ". Decoding may be improved with --itrace=Z...\n", indx); + } else if (packet_queue->instr_count > elem->timestamp) { + /* + * Sanity check that the elem->timestamp - packet_queue->instr_count would not + * result in an underflow. Warn and clamp at 0 if it would. + */ + packet_queue->cs_timestamp = 0; + pr_err("Timestamp calculation underflow at Idx:%" OCSD_TRC_IDX_STR "\n", indx); + } else { + /* + * This is the first timestamp we've seen since the beginning of traces + * or a discontinuity. 
Since timestamps packets are generated *after* + * range packets have been generated, we need to estimate the time at + * which instructions started by subtracting the number of instructions + * executed to the timestamp. + */ + packet_queue->cs_timestamp = elem->timestamp - packet_queue->instr_count; + } + packet_queue->next_cs_timestamp = elem->timestamp; packet_queue->instr_count = 0; /* Tell the front end which traceid_queue needs attention */ @@ -334,8 +356,8 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq, static void cs_etm_decoder__reset_timestamp(struct cs_etm_packet_queue *packet_queue) { - packet_queue->timestamp = 0; - packet_queue->next_timestamp = 0; + packet_queue->cs_timestamp = 0; + packet_queue->next_cs_timestamp = 0; packet_queue->instr_count = 0; } @@ -542,7 +564,7 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq, static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( const void *context, - const ocsd_trc_index_t indx __maybe_unused, + const ocsd_trc_index_t indx, const u8 trace_chan_id __maybe_unused, const ocsd_generic_trace_elem *elem) { @@ -579,7 +601,8 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( break; case OCSD_GEN_TRC_ELEM_TIMESTAMP: resp = cs_etm_decoder__do_hard_timestamp(etmq, elem, - trace_chan_id); + trace_chan_id, + indx); break; case OCSD_GEN_TRC_ELEM_PE_CONTEXT: resp = cs_etm_decoder__set_tid(etmq, packet_queue, diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 7e63e7dedc33..64536a6ed10a 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -38,8 +38,6 @@ #include <tools/libc_compat.h> #include "util/synthetic-events.h" -#define MAX_TIMESTAMP (~0ULL) - struct cs_etm_auxtrace { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -56,6 +54,7 @@ struct cs_etm_auxtrace { u8 sample_instructions; int num_cpu; + u64 latest_kernel_timestamp; u32 auxtrace_type; u64 branches_sample_type; u64 branches_id; @@ -86,7 +85,7 @@ struct cs_etm_queue { struct cs_etm_decoder *decoder; struct auxtrace_buffer *buffer; unsigned int queue_nr; - u8 pending_timestamp; + u8 pending_timestamp_chan_id; u64 offset; const unsigned char *buf; size_t buf_len, buf_used; @@ -208,7 +207,7 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, * be more than one channel per cs_etm_queue, we need to specify * what traceID queue needs servicing. 
*/ - etmq->pending_timestamp = trace_chan_id; + etmq->pending_timestamp_chan_id = trace_chan_id; } static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq, @@ -216,22 +215,22 @@ static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq, { struct cs_etm_packet_queue *packet_queue; - if (!etmq->pending_timestamp) + if (!etmq->pending_timestamp_chan_id) return 0; if (trace_chan_id) - *trace_chan_id = etmq->pending_timestamp; + *trace_chan_id = etmq->pending_timestamp_chan_id; packet_queue = cs_etm__etmq_get_packet_queue(etmq, - etmq->pending_timestamp); + etmq->pending_timestamp_chan_id); if (!packet_queue) return 0; /* Acknowledge pending status */ - etmq->pending_timestamp = 0; + etmq->pending_timestamp_chan_id = 0; /* See function cs_etm_decoder__do_{hard|soft}_timestamp() */ - return packet_queue->timestamp; + return packet_queue->cs_timestamp; } static void cs_etm__clear_packet_queue(struct cs_etm_packet_queue *queue) @@ -814,7 +813,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, int ret = 0; unsigned int cs_queue_nr; u8 trace_chan_id; - u64 timestamp; + u64 cs_timestamp; struct cs_etm_queue *etmq = queue->priv; if (list_empty(&queue->head) || etmq) @@ -854,7 +853,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, /* * Run decoder on the trace block. The decoder will stop when - * encountering a timestamp, a full packet queue or the end of + * encountering a CS timestamp, a full packet queue or the end of * trace for that block. */ ret = cs_etm__decode_data_block(etmq); @@ -865,10 +864,10 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, * Function cs_etm_decoder__do_{hard|soft}_timestamp() does all * the timestamp calculation for us. */ - timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id); + cs_timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id); /* We found a timestamp, no need to continue. */ - if (timestamp) + if (cs_timestamp) break; /* @@ -892,7 +891,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, * queue and will be processed in cs_etm__process_queues(). 
*/ cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id); - ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, timestamp); + ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, cs_timestamp); out: return ret; } @@ -1194,6 +1193,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, event->sample.header.misc = cs_etm__cpu_mode(etmq, addr); event->sample.header.size = sizeof(struct perf_event_header); + if (!etm->timeless_decoding) + sample.time = etm->latest_kernel_timestamp; sample.ip = addr; sample.pid = tidq->pid; sample.tid = tidq->tid; @@ -1250,6 +1251,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, event->sample.header.misc = cs_etm__cpu_mode(etmq, ip); event->sample.header.size = sizeof(struct perf_event_header); + if (!etm->timeless_decoding) + sample.time = etm->latest_kernel_timestamp; sample.ip = ip; sample.pid = tidq->pid; sample.tid = tidq->tid; @@ -2221,7 +2224,7 @@ static int cs_etm__process_queues(struct cs_etm_auxtrace *etm) int ret = 0; unsigned int cs_queue_nr, queue_nr; u8 trace_chan_id; - u64 timestamp; + u64 cs_timestamp; struct auxtrace_queue *queue; struct cs_etm_queue *etmq; struct cs_etm_traceid_queue *tidq; @@ -2283,9 +2286,9 @@ refetch: if (ret) goto out; - timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id); + cs_timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id); - if (!timestamp) { + if (!cs_timestamp) { /* * Function cs_etm__decode_data_block() returns when * there is no more traces to decode in the current @@ -2308,7 +2311,7 @@ refetch: * this queue/traceID. */ cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id); - ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, timestamp); + ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, cs_timestamp); } out: @@ -2380,7 +2383,7 @@ static int cs_etm__process_event(struct perf_session *session, struct perf_tool *tool) { int err = 0; - u64 timestamp; + u64 sample_kernel_timestamp; struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace); @@ -2394,11 +2397,11 @@ static int cs_etm__process_event(struct perf_session *session, } if (sample->time && (sample->time != (u64) -1)) - timestamp = sample->time; + sample_kernel_timestamp = sample->time; else - timestamp = 0; + sample_kernel_timestamp = 0; - if (timestamp || etm->timeless_decoding) { + if (sample_kernel_timestamp || etm->timeless_decoding) { err = cs_etm__update_queues(etm); if (err) return err; @@ -2414,9 +2417,15 @@ static int cs_etm__process_event(struct perf_session *session, else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) return cs_etm__process_switch_cpu_wide(etm, event); - if (!etm->timeless_decoding && - event->header.type == PERF_RECORD_AUX) + if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) { + /* + * Record the latest kernel timestamp available in the header + * for samples so that synthesised samples occur from this point + * onwards. + */ + etm->latest_kernel_timestamp = sample_kernel_timestamp; return cs_etm__process_queues(etm); + } return 0; } @@ -2464,6 +2473,10 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm) struct evlist *evlist = etm->session->evlist; bool timeless_decoding = true; + /* Override timeless mode with user input from --itrace=Z */ + if (etm->synth_opts.timeless_decoding) + return true; + /* * Circle through the list of event and complain if we find one * with the time bit set. 
@@ -2810,6 +2823,14 @@ int cs_etm__process_auxtrace_info(union perf_event *event, if (err) goto err_free_etm; + if (session->itrace_synth_opts->set) { + etm->synth_opts = *session->itrace_synth_opts; + } else { + itrace_synth_opts__set_default(&etm->synth_opts, + session->itrace_synth_opts->default_no_sample); + etm->synth_opts.callchain = false; + } + etm->session = session; etm->machine = &session->machines.host; @@ -2854,14 +2875,6 @@ int cs_etm__process_auxtrace_info(union perf_event *event, return 0; } - if (session->itrace_synth_opts->set) { - etm->synth_opts = *session->itrace_synth_opts; - } else { - itrace_synth_opts__set_default(&etm->synth_opts, - session->itrace_synth_opts->default_no_sample); - etm->synth_opts.callchain = false; - } - err = cs_etm__synth_events(etm, session); if (err) goto err_delete_thread; diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 36428918411e..d65c7b19407d 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -171,8 +171,8 @@ struct cs_etm_packet_queue { u32 head; u32 tail; u32 instr_count; - u64 timestamp; - u64 next_timestamp; + u64 cs_timestamp; + u64 next_cs_timestamp; struct cs_etm_packet packet_buffer[CS_ETM_PACKET_MAX_BUFFER]; }; diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 8fca4779ae6a..a9c102e8e3c0 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -240,11 +240,12 @@ static bool is_dir(struct perf_data *data) static int open_file_read(struct perf_data *data) { + int flags = data->in_place_update ? O_RDWR : O_RDONLY; struct stat st; int fd; char sbuf[STRERR_BUFSIZE]; - fd = open(data->file.path, O_RDONLY); + fd = open(data->file.path, flags); if (fd < 0) { int err = errno; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 62a3e66fbee8..c9de82af5584 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -31,6 +31,7 @@ struct perf_data { bool is_dir; bool force; bool use_stdio; + bool in_place_update; enum perf_data_mode mode; struct { diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 9130f6fad8d5..1bea5b29b12d 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -202,6 +202,18 @@ void perf_env__exit(struct perf_env *env) for (i = 0; i < env->nr_memory_nodes; i++) zfree(&env->memory_nodes[i].set); zfree(&env->memory_nodes); + + for (i = 0; i < env->nr_hybrid_nodes; i++) { + zfree(&env->hybrid_nodes[i].pmu_name); + zfree(&env->hybrid_nodes[i].cpus); + } + zfree(&env->hybrid_nodes); + + for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { + zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); + zfree(&env->hybrid_cpc_nodes[i].pmu_name); + } + zfree(&env->hybrid_cpc_nodes); } void perf_env__init(struct perf_env *env __maybe_unused) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index ca249bf5e984..6824a7423a2d 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -37,6 +37,18 @@ struct memory_node { unsigned long *set; }; +struct hybrid_node { + char *pmu_name; + char *cpus; +}; + +struct hybrid_cpc_node { + int nr_cpu_pmu_caps; + unsigned int max_branches; + char *cpu_pmu_caps; + char *pmu_name; +}; + struct perf_env { char *hostname; char *os_release; @@ -59,6 +71,8 @@ struct perf_env { int nr_pmu_mappings; int nr_groups; int nr_cpu_pmu_caps; + int nr_hybrid_nodes; + int nr_hybrid_cpc_nodes; char *cmdline; const char **cmdline_argv; char *sibling_cores; @@ -77,6 +91,8 @@ struct perf_env { struct numa_node *numa_nodes; struct memory_node *memory_nodes; unsigned long long memory_bsize; + struct 
hybrid_node *hybrid_nodes; + struct hybrid_cpc_node *hybrid_cpc_nodes; #ifdef HAVE_LIBBPF_SUPPORT /* * bpf_info_lock protects bpf rbtrees. This is needed because the diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index aa1e42518d37..0158d2945bab 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -49,6 +49,7 @@ #include "cputopo.h" #include "bpf-event.h" #include "clockid.h" +#include "pmu-hybrid.h" #include <linux/ctype.h> #include <internal/lib.h> @@ -932,6 +933,40 @@ static int write_clock_data(struct feat_fd *ff, return do_write(ff, data64, sizeof(*data64)); } +static int write_hybrid_topology(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct hybrid_topology *tp; + int ret; + u32 i; + + tp = hybrid_topology__new(); + if (!tp) + return -ENOENT; + + ret = do_write(ff, &tp->nr, sizeof(u32)); + if (ret < 0) + goto err; + + for (i = 0; i < tp->nr; i++) { + struct hybrid_topology_node *n = &tp->nodes[i]; + + ret = do_write_string(ff, n->pmu_name); + if (ret < 0) + goto err; + + ret = do_write_string(ff, n->cpus); + if (ret < 0) + goto err; + } + + ret = 0; + +err: + hybrid_topology__delete(tp); + return ret; +} + static int write_dir_format(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { @@ -1425,18 +1460,14 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } -static int write_cpu_pmu_caps(struct feat_fd *ff, - struct evlist *evlist __maybe_unused) +static int write_per_cpu_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, + bool write_pmu) { - struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); struct perf_pmu_caps *caps = NULL; int nr_caps; int ret; - if (!cpu_pmu) - return -ENOENT; - - nr_caps = perf_pmu__caps_parse(cpu_pmu); + nr_caps = perf_pmu__caps_parse(pmu); if (nr_caps < 0) return nr_caps; @@ -1444,7 +1475,7 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, if (ret < 0) return ret; - list_for_each_entry(caps, &cpu_pmu->caps, list) { + list_for_each_entry(caps, &pmu->caps, list) { ret = do_write_string(ff, caps->name); if (ret < 0) return ret; @@ -1454,9 +1485,49 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, return ret; } + if (write_pmu) { + ret = do_write_string(ff, pmu->name); + if (ret < 0) + return ret; + } + return ret; } +static int write_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + + if (!cpu_pmu) + return -ENOENT; + + return write_per_cpu_pmu_caps(ff, cpu_pmu, false); +} + +static int write_hybrid_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *pmu; + u32 nr_pmu = perf_pmu__hybrid_pmu_num(); + int ret; + + if (nr_pmu == 0) + return -ENOENT; + + ret = do_write(ff, &nr_pmu, sizeof(nr_pmu)); + if (ret < 0) + return ret; + + perf_pmu__for_each_hybrid_pmu(pmu) { + ret = write_per_cpu_pmu_caps(ff, pmu, true); + if (ret < 0) + return ret; + } + + return 0; +} + static void print_hostname(struct feat_fd *ff, FILE *fp) { fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname); @@ -1623,6 +1694,18 @@ static void print_clock_data(struct feat_fd *ff, FILE *fp) clockid_name(clockid)); } +static void print_hybrid_topology(struct feat_fd *ff, FILE *fp) +{ + int i; + struct hybrid_node *n; + + fprintf(fp, "# hybrid cpu system:\n"); + for (i = 0; i < ff->ph->env.nr_hybrid_nodes; i++) { + n = &ff->ph->env.hybrid_nodes[i]; + fprintf(fp, "# %s cpu list : %s\n", n->pmu_name, n->cpus); + 
} +} + static void print_dir_format(struct feat_fd *ff, FILE *fp) { struct perf_session *session; @@ -1916,18 +1999,28 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, + char *pmu_name) { - const char *delimiter = "# cpu pmu capabilities: "; - u32 nr_caps = ff->ph->env.nr_cpu_pmu_caps; - char *str; + const char *delimiter; + char *str, buf[128]; if (!nr_caps) { - fprintf(fp, "# cpu pmu capabilities: not available\n"); + if (!pmu_name) + fprintf(fp, "# cpu pmu capabilities: not available\n"); + else + fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - str = ff->ph->env.cpu_pmu_caps; + if (!pmu_name) + scnprintf(buf, sizeof(buf), "# cpu pmu capabilities: "); + else + scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); + + delimiter = buf; + + str = cpu_pmu_caps; while (nr_caps--) { fprintf(fp, "%s%s", delimiter, str); delimiter = ", "; @@ -1937,6 +2030,24 @@ static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) fprintf(fp, "\n"); } +static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +{ + print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, + ff->ph->env.cpu_pmu_caps, NULL); +} + +static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +{ + struct hybrid_cpc_node *n; + + for (int i = 0; i < ff->ph->env.nr_hybrid_cpc_nodes; i++) { + n = &ff->ph->env.hybrid_cpc_nodes[i]; + print_per_cpu_pmu_caps(fp, n->nr_cpu_pmu_caps, + n->cpu_pmu_caps, + n->pmu_name); + } +} + static void print_pmu_mappings(struct feat_fd *ff, FILE *fp) { const char *delimiter = "# pmu mappings: "; @@ -2849,6 +2960,46 @@ static int process_clock_data(struct feat_fd *ff, return 0; } +static int process_hybrid_topology(struct feat_fd *ff, + void *data __maybe_unused) +{ + struct hybrid_node *nodes, *n; + u32 nr, i; + + /* nr nodes */ + if (do_read_u32(ff, &nr)) + return -1; + + nodes = zalloc(sizeof(*nodes) * nr); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + n = &nodes[i]; + + n->pmu_name = do_read_string(ff); + if (!n->pmu_name) + goto error; + + n->cpus = do_read_string(ff); + if (!n->cpus) + goto error; + } + + ff->ph->env.nr_hybrid_nodes = nr; + ff->ph->env.hybrid_nodes = nodes; + return 0; + +error: + for (i = 0; i < nr; i++) { + free(nodes[i].pmu_name); + free(nodes[i].cpus); + } + + free(nodes); + return -1; +} + static int process_dir_format(struct feat_fd *ff, void *_data __maybe_unused) { @@ -3002,8 +3153,9 @@ static int process_compressed(struct feat_fd *ff, return 0; } -static int process_cpu_pmu_caps(struct feat_fd *ff, - void *data __maybe_unused) +static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, + char **cpu_pmu_caps, + unsigned int *max_branches) { char *name, *value; struct strbuf sb; @@ -3017,7 +3169,7 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, return 0; } - ff->ph->env.nr_cpu_pmu_caps = nr_caps; + *nr_cpu_pmu_caps = nr_caps; if (strbuf_init(&sb, 128) < 0) return -1; @@ -3039,12 +3191,12 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, goto free_value; if (!strcmp(name, "branches")) - ff->ph->env.max_branches = atoi(value); + *max_branches = atoi(value); free(value); free(name); } - ff->ph->env.cpu_pmu_caps = strbuf_detach(&sb, NULL); + *cpu_pmu_caps = strbuf_detach(&sb, NULL); return 0; free_value: @@ -3056,6 +3208,63 @@ error: return -1; } +static int 
process_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + return process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + &ff->ph->env.cpu_pmu_caps, + &ff->ph->env.max_branches); +} + +static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + struct hybrid_cpc_node *nodes; + u32 nr_pmu, i; + int ret; + + if (do_read_u32(ff, &nr_pmu)) + return -1; + + if (!nr_pmu) { + pr_debug("hybrid cpu pmu capabilities not available\n"); + return 0; + } + + nodes = zalloc(sizeof(*nodes) * nr_pmu); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nr_pmu; i++) { + struct hybrid_cpc_node *n = &nodes[i]; + + ret = process_per_cpu_pmu_caps(ff, &n->nr_cpu_pmu_caps, + &n->cpu_pmu_caps, + &n->max_branches); + if (ret) + goto err; + + n->pmu_name = do_read_string(ff); + if (!n->pmu_name) { + ret = -1; + goto err; + } + } + + ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; + ff->ph->env.hybrid_cpc_nodes = nodes; + return 0; + +err: + for (i = 0; i < nr_pmu; i++) { + free(nodes[i].cpu_pmu_caps); + free(nodes[i].pmu_name); + } + + free(nodes); + return ret; +} + #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -3117,6 +3326,8 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(COMPRESSED, compressed, false), FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), FEAT_OPR(CLOCK_DATA, clock_data, false), + FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), + FEAT_OPR(HYBRID_CPU_PMU_CAPS, hybrid_cpu_pmu_caps, false), }; struct header_print_data { @@ -3814,6 +4025,11 @@ int perf_session__read_header(struct perf_session *session) if (perf_file_header__read(&f_header, header, fd) < 0) return -EINVAL; + if (header->needs_swap && data->in_place_update) { + pr_err("In-place update not supported when byte-swapping is required\n"); + return -EINVAL; + } + /* * Sanity check that perf.data was written cleanly; data size is * initialized to 0 and updated only if the on_exit function is run. diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 2aca71763ecf..ae6b1cf19a7d 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -45,6 +45,8 @@ enum { HEADER_COMPRESSED, HEADER_CPU_PMU_CAPS, HEADER_CLOCK_DATA, + HEADER_HYBRID_TOPOLOGY, + HEADER_HYBRID_CPU_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 20ad663978cc..cb2520abf261 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -35,6 +35,10 @@ #define BIT63 (((uint64_t)1 << 63)) +#define SEVEN_BYTES 0xffffffffffffffULL + +#define NO_VMCS 0xffffffffffULL + #define INTEL_PT_RETURN 1 /* Maximum number of loops with no packets consumed i.e. 
stuck in a loop */ @@ -51,6 +55,11 @@ struct intel_pt_stack { int pos; }; +enum intel_pt_p_once { + INTEL_PT_PRT_ONCE_UNK_VMCS, + INTEL_PT_PRT_ONCE_ERANGE, +}; + enum intel_pt_pkt_state { INTEL_PT_STATE_NO_PSB, INTEL_PT_STATE_NO_IP, @@ -64,6 +73,7 @@ enum intel_pt_pkt_state { INTEL_PT_STATE_FUP_NO_TIP, INTEL_PT_STATE_FUP_IN_PSB, INTEL_PT_STATE_RESAMPLE, + INTEL_PT_STATE_VM_TIME_CORRELATION, }; static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state) @@ -75,6 +85,7 @@ static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state) case INTEL_PT_STATE_IN_SYNC: case INTEL_PT_STATE_TNT_CONT: case INTEL_PT_STATE_RESAMPLE: + case INTEL_PT_STATE_VM_TIME_CORRELATION: return true; case INTEL_PT_STATE_TNT: case INTEL_PT_STATE_TIP: @@ -107,6 +118,7 @@ struct intel_pt_decoder { uint64_t max_insn_cnt, void *data); bool (*pgd_ip)(uint64_t ip, void *data); int (*lookahead)(void *data, intel_pt_lookahead_cb_t cb, void *cb_data); + struct intel_pt_vmcs_info *(*findnew_vmcs_info)(void *data, uint64_t vmcs); void *data; struct intel_pt_state state; const unsigned char *buf; @@ -122,6 +134,11 @@ struct intel_pt_decoder { bool in_psb; bool hop; bool leap; + bool vm_time_correlation; + bool vm_tm_corr_dry_run; + bool vm_tm_corr_reliable; + bool vm_tm_corr_same_buf; + bool vm_tm_corr_continuous; bool nr; bool next_nr; enum intel_pt_param_flags flags; @@ -139,6 +156,11 @@ struct intel_pt_decoder { uint64_t ctc_delta; uint64_t cycle_cnt; uint64_t cyc_ref_timestamp; + uint64_t first_timestamp; + uint64_t last_reliable_timestamp; + uint64_t vmcs; + uint64_t print_once; + uint64_t last_ctc; uint32_t last_mtc; uint32_t tsc_ctc_ratio_n; uint32_t tsc_ctc_ratio_d; @@ -217,6 +239,31 @@ static uint64_t intel_pt_lower_power_of_2(uint64_t x) return x << i; } +__printf(1, 2) +static void p_log(const char *fmt, ...) 
+{ + char buf[512]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + fprintf(stderr, "%s\n", buf); + intel_pt_log("%s\n", buf); +} + +static bool intel_pt_print_once(struct intel_pt_decoder *decoder, + enum intel_pt_p_once id) +{ + uint64_t bit = 1ULL << id; + + if (decoder->print_once & bit) + return false; + decoder->print_once |= bit; + return true; +} + static uint64_t intel_pt_cyc_threshold(uint64_t ctl) { if (!(ctl & INTEL_PT_CYC_ENABLE)) @@ -258,11 +305,16 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) decoder->walk_insn = params->walk_insn; decoder->pgd_ip = params->pgd_ip; decoder->lookahead = params->lookahead; + decoder->findnew_vmcs_info = params->findnew_vmcs_info; decoder->data = params->data; decoder->return_compression = params->return_compression; decoder->branch_enable = params->branch_enable; decoder->hop = params->quick >= 1; decoder->leap = params->quick >= 2; + decoder->vm_time_correlation = params->vm_time_correlation; + decoder->vm_tm_corr_dry_run = params->vm_tm_corr_dry_run; + decoder->first_timestamp = params->first_timestamp; + decoder->last_reliable_timestamp = params->first_timestamp; decoder->flags = params->flags; @@ -312,6 +364,12 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) return decoder; } +void intel_pt_set_first_timestamp(struct intel_pt_decoder *decoder, + uint64_t first_timestamp) +{ + decoder->first_timestamp = first_timestamp; +} + static void intel_pt_pop_blk(struct intel_pt_stack *stack) { struct intel_pt_blk *blk = stack->blk; @@ -577,6 +635,7 @@ static int intel_pt_get_data(struct intel_pt_decoder *decoder, bool reposition) intel_pt_reposition(decoder); decoder->ref_timestamp = buffer.ref_timestamp; decoder->state.trace_nr = buffer.trace_nr; + decoder->vm_tm_corr_same_buf = false; intel_pt_log("Reference timestamp 0x%" PRIx64 "\n", decoder->ref_timestamp); return -ENOLINK; @@ -1469,9 +1528,24 @@ static uint64_t intel_pt_8b_tsc(uint64_t timestamp, uint64_t ref_timestamp) return timestamp; } +/* For use only when decoder->vm_time_correlation is true */ +static bool intel_pt_time_in_range(struct intel_pt_decoder *decoder, + uint64_t timestamp) +{ + uint64_t max_timestamp = decoder->buf_timestamp; + + if (!max_timestamp) { + max_timestamp = decoder->last_reliable_timestamp + + 0x400000000ULL; + } + return timestamp >= decoder->last_reliable_timestamp && + timestamp < decoder->buf_timestamp; +} + static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) { uint64_t timestamp; + bool bad = false; decoder->have_tma = false; @@ -1493,10 +1567,21 @@ static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) timestamp = decoder->timestamp; } if (timestamp < decoder->timestamp) { - intel_pt_log_to("Wraparound timestamp", timestamp); - timestamp += (1ULL << 56); - decoder->tsc_timestamp = timestamp; + if (!decoder->buf_timestamp || + (timestamp + (1ULL << 56) < decoder->buf_timestamp)) { + intel_pt_log_to("Wraparound timestamp", timestamp); + timestamp += (1ULL << 56); + decoder->tsc_timestamp = timestamp; + } else { + intel_pt_log_to("Suppressing bad timestamp", timestamp); + timestamp = decoder->timestamp; + bad = true; + } } + if (decoder->vm_time_correlation && + (bad || !intel_pt_time_in_range(decoder, timestamp)) && + intel_pt_print_once(decoder, INTEL_PT_PRT_ONCE_ERANGE)) + p_log("Timestamp out of range"); decoder->timestamp = timestamp; decoder->timestamp_insn_cnt = 0; } @@ -1573,6 +1658,7 @@ 
static void intel_pt_calc_tma(struct intel_pt_decoder *decoder) intel_pt_mtc_cyc_cnt_upd(decoder); decoder->last_mtc = (ctc >> decoder->mtc_shift) & 0xff; + decoder->last_ctc = ctc - ctc_rem; decoder->ctc_timestamp = decoder->tsc_timestamp - fc; if (decoder->tsc_ctc_mult) { decoder->ctc_timestamp -= ctc_rem * decoder->tsc_ctc_mult; @@ -1957,6 +2043,613 @@ static int intel_pt_resample(struct intel_pt_decoder *decoder) return 0; } +struct intel_pt_vm_tsc_info { + struct intel_pt_pkt pip_packet; + struct intel_pt_pkt vmcs_packet; + struct intel_pt_pkt tma_packet; + bool tsc, pip, vmcs, tma, psbend; + uint64_t ctc_delta; + uint64_t last_ctc; + int max_lookahead; +}; + +/* Lookahead and get the PIP, VMCS and TMA packets from PSB+ */ +static int intel_pt_vm_psb_lookahead_cb(struct intel_pt_pkt_info *pkt_info) +{ + struct intel_pt_vm_tsc_info *data = pkt_info->data; + + switch (pkt_info->packet.type) { + case INTEL_PT_PAD: + case INTEL_PT_MNT: + case INTEL_PT_MODE_EXEC: + case INTEL_PT_MODE_TSX: + case INTEL_PT_MTC: + case INTEL_PT_FUP: + case INTEL_PT_CYC: + case INTEL_PT_CBR: + break; + + case INTEL_PT_TSC: + data->tsc = true; + break; + + case INTEL_PT_TMA: + data->tma_packet = pkt_info->packet; + data->tma = true; + break; + + case INTEL_PT_PIP: + data->pip_packet = pkt_info->packet; + data->pip = true; + break; + + case INTEL_PT_VMCS: + data->vmcs_packet = pkt_info->packet; + data->vmcs = true; + break; + + case INTEL_PT_PSBEND: + data->psbend = true; + return 1; + + case INTEL_PT_TIP_PGE: + case INTEL_PT_PTWRITE: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_PWRX: + case INTEL_PT_BBP: + case INTEL_PT_BIP: + case INTEL_PT_BEP: + case INTEL_PT_BEP_IP: + case INTEL_PT_OVF: + case INTEL_PT_BAD: + case INTEL_PT_TNT: + case INTEL_PT_TIP_PGD: + case INTEL_PT_TIP: + case INTEL_PT_PSB: + case INTEL_PT_TRACESTOP: + default: + return 1; + } + + return 0; +} + +struct intel_pt_ovf_fup_info { + int max_lookahead; + bool found; +}; + +/* Lookahead to detect a FUP packet after OVF */ +static int intel_pt_ovf_fup_lookahead_cb(struct intel_pt_pkt_info *pkt_info) +{ + struct intel_pt_ovf_fup_info *data = pkt_info->data; + + if (pkt_info->packet.type == INTEL_PT_CYC || + pkt_info->packet.type == INTEL_PT_MTC || + pkt_info->packet.type == INTEL_PT_TSC) + return !--(data->max_lookahead); + data->found = pkt_info->packet.type == INTEL_PT_FUP; + return 1; +} + +static bool intel_pt_ovf_fup_lookahead(struct intel_pt_decoder *decoder) +{ + struct intel_pt_ovf_fup_info data = { + .max_lookahead = 16, + .found = false, + }; + + intel_pt_pkt_lookahead(decoder, intel_pt_ovf_fup_lookahead_cb, &data); + return data.found; +} + +/* Lookahead and get the TMA packet after TSC */ +static int intel_pt_tma_lookahead_cb(struct intel_pt_pkt_info *pkt_info) +{ + struct intel_pt_vm_tsc_info *data = pkt_info->data; + + if (pkt_info->packet.type == INTEL_PT_CYC || + pkt_info->packet.type == INTEL_PT_MTC) + return !--(data->max_lookahead); + + if (pkt_info->packet.type == INTEL_PT_TMA) { + data->tma_packet = pkt_info->packet; + data->tma = true; + } + return 1; +} + +static uint64_t intel_pt_ctc_to_tsc(struct intel_pt_decoder *decoder, uint64_t ctc) +{ + if (decoder->tsc_ctc_mult) + return ctc * decoder->tsc_ctc_mult; + else + return multdiv(ctc, decoder->tsc_ctc_ratio_n, decoder->tsc_ctc_ratio_d); +} + +static uint64_t intel_pt_calc_expected_tsc(struct intel_pt_decoder *decoder, + uint32_t ctc, + uint32_t fc, + uint64_t last_ctc_timestamp, 
+ uint64_t ctc_delta, + uint32_t last_ctc) +{ + /* Number of CTC ticks from last_ctc_timestamp to last_mtc */ + uint64_t last_mtc_ctc = last_ctc + ctc_delta; + /* + * Number of CTC ticks from there until current TMA packet. We would + * expect last_mtc_ctc to be before ctc, but the TSC packet can slip + * past an MTC, so a sign-extended value is used. + */ + uint64_t delta = (int16_t)((uint16_t)ctc - (uint16_t)last_mtc_ctc); + /* Total CTC ticks from last_ctc_timestamp to current TMA packet */ + uint64_t new_ctc_delta = ctc_delta + delta; + uint64_t expected_tsc; + + /* + * Convert CTC ticks to TSC ticks, add the starting point + * (last_ctc_timestamp) and the fast counter from the TMA packet. + */ + expected_tsc = last_ctc_timestamp + intel_pt_ctc_to_tsc(decoder, new_ctc_delta) + fc; + + if (intel_pt_enable_logging) { + intel_pt_log_x64(last_mtc_ctc); + intel_pt_log_x32(last_ctc); + intel_pt_log_x64(ctc_delta); + intel_pt_log_x64(delta); + intel_pt_log_x32(ctc); + intel_pt_log_x64(new_ctc_delta); + intel_pt_log_x64(last_ctc_timestamp); + intel_pt_log_x32(fc); + intel_pt_log_x64(intel_pt_ctc_to_tsc(decoder, new_ctc_delta)); + intel_pt_log_x64(expected_tsc); + } + + return expected_tsc; +} + +static uint64_t intel_pt_expected_tsc(struct intel_pt_decoder *decoder, + struct intel_pt_vm_tsc_info *data) +{ + uint32_t ctc = data->tma_packet.payload; + uint32_t fc = data->tma_packet.count; + + return intel_pt_calc_expected_tsc(decoder, ctc, fc, + decoder->ctc_timestamp, + data->ctc_delta, data->last_ctc); +} + +static void intel_pt_translate_vm_tsc(struct intel_pt_decoder *decoder, + struct intel_pt_vmcs_info *vmcs_info) +{ + uint64_t payload = decoder->packet.payload; + + /* VMX adds the TSC Offset, so subtract to get host TSC */ + decoder->packet.payload -= vmcs_info->tsc_offset; + /* TSC packet has only 7 bytes */ + decoder->packet.payload &= SEVEN_BYTES; + + /* + * The buffer is mmapped from the data file, so this also updates the + * data file. 
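+ * A TSC packet is one header byte followed by a 7-byte payload, hence + * the copy of 7 bytes to decoder->buf + 1 below. In a dry run the + * translated value is still computed but nothing is written back.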
+ */ + if (!decoder->vm_tm_corr_dry_run) + memcpy((void *)decoder->buf + 1, &decoder->packet.payload, 7); + + intel_pt_log("Translated VM TSC %#" PRIx64 " -> %#" PRIx64 + " VMCS %#" PRIx64 " TSC Offset %#" PRIx64 "\n", + payload, decoder->packet.payload, vmcs_info->vmcs, + vmcs_info->tsc_offset); +} + +static void intel_pt_translate_vm_tsc_offset(struct intel_pt_decoder *decoder, + uint64_t tsc_offset) +{ + struct intel_pt_vmcs_info vmcs_info = { + .vmcs = NO_VMCS, + .tsc_offset = tsc_offset + }; + + intel_pt_translate_vm_tsc(decoder, &vmcs_info); +} + +static inline bool in_vm(uint64_t pip_payload) +{ + return pip_payload & 1; +} + +static inline bool pip_in_vm(struct intel_pt_pkt *pip_packet) +{ + return pip_packet->payload & 1; +} + +static void intel_pt_print_vmcs_info(struct intel_pt_vmcs_info *vmcs_info) +{ + p_log("VMCS: %#" PRIx64 " TSC Offset %#" PRIx64, + vmcs_info->vmcs, vmcs_info->tsc_offset); +} + +static void intel_pt_vm_tm_corr_psb(struct intel_pt_decoder *decoder, + struct intel_pt_vm_tsc_info *data) +{ + memset(data, 0, sizeof(*data)); + data->ctc_delta = decoder->ctc_delta; + data->last_ctc = decoder->last_ctc; + intel_pt_pkt_lookahead(decoder, intel_pt_vm_psb_lookahead_cb, data); + if (data->tsc && !data->psbend) + p_log("ERROR: PSB without PSBEND"); + decoder->in_psb = data->psbend; +} + +static void intel_pt_vm_tm_corr_first_tsc(struct intel_pt_decoder *decoder, + struct intel_pt_vm_tsc_info *data, + struct intel_pt_vmcs_info *vmcs_info, + uint64_t host_tsc) +{ + if (!decoder->in_psb) { + /* Can't happen */ + p_log("ERROR: First TSC is not in PSB+"); + } + + if (data->pip) { + if (pip_in_vm(&data->pip_packet)) { /* Guest */ + if (vmcs_info && vmcs_info->tsc_offset) { + intel_pt_translate_vm_tsc(decoder, vmcs_info); + decoder->vm_tm_corr_reliable = true; + } else { + p_log("ERROR: First TSC, unknown TSC Offset"); + } + } else { /* Host */ + decoder->vm_tm_corr_reliable = true; + } + } else { /* Host or Guest */ + decoder->vm_tm_corr_reliable = false; + if (intel_pt_time_in_range(decoder, host_tsc)) { + /* Assume Host */ + } else { + /* Assume Guest */ + if (vmcs_info && vmcs_info->tsc_offset) + intel_pt_translate_vm_tsc(decoder, vmcs_info); + else + p_log("ERROR: First TSC, no PIP, unknown TSC Offset"); + } + } +} + +static void intel_pt_vm_tm_corr_tsc(struct intel_pt_decoder *decoder, + struct intel_pt_vm_tsc_info *data) +{ + struct intel_pt_vmcs_info *vmcs_info; + uint64_t tsc_offset = 0; + uint64_t vmcs; + bool reliable = true; + uint64_t expected_tsc; + uint64_t host_tsc; + uint64_t ref_timestamp; + + bool assign = false; + bool assign_reliable = false; + + /* Already have 'data' for the in_psb case */ + if (!decoder->in_psb) { + memset(data, 0, sizeof(*data)); + data->ctc_delta = decoder->ctc_delta; + data->last_ctc = decoder->last_ctc; + data->max_lookahead = 16; + intel_pt_pkt_lookahead(decoder, intel_pt_tma_lookahead_cb, data); + if (decoder->pge) { + data->pip = true; + data->pip_packet.payload = decoder->pip_payload; + } + } + + /* Calculations depend on having TMA packets */ + if (!data->tma) { + p_log("ERROR: TSC without TMA"); + return; + } + + vmcs = data->vmcs ? data->vmcs_packet.payload : decoder->vmcs; + if (vmcs == NO_VMCS) + vmcs = 0; + + vmcs_info = decoder->findnew_vmcs_info(decoder->data, vmcs); + + ref_timestamp = decoder->timestamp ? 
decoder->timestamp : decoder->buf_timestamp; + host_tsc = intel_pt_8b_tsc(decoder->packet.payload, ref_timestamp); + + if (!decoder->ctc_timestamp) { + intel_pt_vm_tm_corr_first_tsc(decoder, data, vmcs_info, host_tsc); + return; + } + + expected_tsc = intel_pt_expected_tsc(decoder, data); + + tsc_offset = host_tsc - expected_tsc; + + /* Determine if TSC is from Host or Guest */ + if (data->pip) { + if (pip_in_vm(&data->pip_packet)) { /* Guest */ + if (!vmcs_info) { + /* PIP NR=1 without VMCS cannot happen */ + p_log("ERROR: Missing VMCS"); + intel_pt_translate_vm_tsc_offset(decoder, tsc_offset); + decoder->vm_tm_corr_reliable = false; + return; + } + } else { /* Host */ + decoder->last_reliable_timestamp = host_tsc; + decoder->vm_tm_corr_reliable = true; + return; + } + } else { /* Host or Guest */ + reliable = false; /* Host/Guest is a guess, so not reliable */ + if (decoder->in_psb) { + if (!tsc_offset) + return; /* Zero TSC Offset, assume Host */ + /* + * TSC packet has only 7 bytes of TSC. We have no + * information about the Guest's 8th byte, but it + * doesn't matter because we only need 7 bytes. + * Here, since the 8th byte is unreliable and + * irrelevant, compare only 7 bytes. + */ + if (vmcs_info && + (tsc_offset & SEVEN_BYTES) == + (vmcs_info->tsc_offset & SEVEN_BYTES)) { + /* Same TSC Offset as last VMCS, assume Guest */ + goto guest; + } + } + /* + * Check if the host_tsc is within the expected range. + * Note, we could narrow the range more by looking ahead for + * the next host TSC in the same buffer, but we don't bother to + * do that because this is probably good enough. + */ + if (host_tsc >= expected_tsc && intel_pt_time_in_range(decoder, host_tsc)) { + /* Within expected range for Host TSC, assume Host */ + decoder->vm_tm_corr_reliable = false; + return; + } + } + +guest: /* Assuming Guest */ + + /* Determine whether to assign TSC Offset */ + if (vmcs_info && vmcs_info->vmcs) { + if (vmcs_info->tsc_offset && vmcs_info->reliable) { + assign = false; + } else if (decoder->in_psb && data->pip && decoder->vm_tm_corr_reliable && + decoder->vm_tm_corr_continuous && decoder->vm_tm_corr_same_buf) { + /* Continuous tracing, TSC in a PSB is not a time loss */ + assign = true; + assign_reliable = true; + } else if (decoder->in_psb && data->pip && decoder->vm_tm_corr_same_buf) { + /* + * Unlikely to be a time loss TSC in a PSB which is not + * at the start of a buffer.
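+ * Record the offset anyway, but leave it marked unreliable so + * that a later, reliable determination can overwrite it.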
+ */ + assign = true; + assign_reliable = false; + } + } + + /* Record VMCS TSC Offset */ + if (assign && (vmcs_info->tsc_offset != tsc_offset || + vmcs_info->reliable != assign_reliable)) { + bool print = vmcs_info->tsc_offset != tsc_offset; + + vmcs_info->tsc_offset = tsc_offset; + vmcs_info->reliable = assign_reliable; + if (print) + intel_pt_print_vmcs_info(vmcs_info); + } + + /* Determine what TSC Offset to use */ + if (vmcs_info && vmcs_info->tsc_offset) { + if (!vmcs_info->reliable) + reliable = false; + intel_pt_translate_vm_tsc(decoder, vmcs_info); + } else { + reliable = false; + if (vmcs_info) { + if (!vmcs_info->error_printed) { + p_log("ERROR: Unknown TSC Offset for VMCS %#" PRIx64, + vmcs_info->vmcs); + vmcs_info->error_printed = true; + } + } else { + if (intel_pt_print_once(decoder, INTEL_PT_PRT_ONCE_UNK_VMCS)) + p_log("ERROR: Unknown VMCS"); + } + intel_pt_translate_vm_tsc_offset(decoder, tsc_offset); + } + + decoder->vm_tm_corr_reliable = reliable; +} + +static void intel_pt_vm_tm_corr_pebs_tsc(struct intel_pt_decoder *decoder) +{ + uint64_t host_tsc = decoder->packet.payload; + uint64_t guest_tsc = decoder->packet.payload; + struct intel_pt_vmcs_info *vmcs_info; + uint64_t vmcs; + + vmcs = decoder->vmcs; + if (vmcs == NO_VMCS) + vmcs = 0; + + vmcs_info = decoder->findnew_vmcs_info(decoder->data, vmcs); + + if (decoder->pge) { + if (in_vm(decoder->pip_payload)) { /* Guest */ + if (!vmcs_info) { + /* PIP NR=1 without VMCS cannot happen */ + p_log("ERROR: Missing VMCS"); + } + } else { /* Host */ + return; + } + } else { /* Host or Guest */ + if (intel_pt_time_in_range(decoder, host_tsc)) { + /* Within expected range for Host TSC, assume Host */ + return; + } + } + + if (vmcs_info) { + /* Translate Guest TSC to Host TSC */ + host_tsc = ((guest_tsc & SEVEN_BYTES) - vmcs_info->tsc_offset) & SEVEN_BYTES; + host_tsc = intel_pt_8b_tsc(host_tsc, decoder->timestamp); + intel_pt_log("Translated VM TSC %#" PRIx64 " -> %#" PRIx64 + " VMCS %#" PRIx64 " TSC Offset %#" PRIx64 "\n", + guest_tsc, host_tsc, vmcs_info->vmcs, + vmcs_info->tsc_offset); + if (!intel_pt_time_in_range(decoder, host_tsc) && + intel_pt_print_once(decoder, INTEL_PT_PRT_ONCE_ERANGE)) + p_log("Timestamp out of range"); + } else { + if (intel_pt_print_once(decoder, INTEL_PT_PRT_ONCE_UNK_VMCS)) + p_log("ERROR: Unknown VMCS"); + host_tsc = decoder->timestamp; + } + + decoder->packet.payload = host_tsc; + + if (!decoder->vm_tm_corr_dry_run) + memcpy((void *)decoder->buf + 1, &host_tsc, 8); +} + +static int intel_pt_vm_time_correlation(struct intel_pt_decoder *decoder) +{ + struct intel_pt_vm_tsc_info data = { .psbend = false }; + bool pge; + int err; + + if (decoder->in_psb) + intel_pt_vm_tm_corr_psb(decoder, &data); + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err == -ENOLINK) + continue; + if (err) + break; + + switch (decoder->packet.type) { + case INTEL_PT_TIP_PGD: + decoder->pge = false; + decoder->vm_tm_corr_continuous = false; + break; + + case INTEL_PT_TNT: + case INTEL_PT_TIP: + case INTEL_PT_TIP_PGE: + decoder->pge = true; + break; + + case INTEL_PT_OVF: + decoder->in_psb = false; + pge = decoder->pge; + decoder->pge = intel_pt_ovf_fup_lookahead(decoder); + if (pge != decoder->pge) + intel_pt_log("Surprising PGE change in OVF!"); + if (!decoder->pge) + decoder->vm_tm_corr_continuous = false; + break; + + case INTEL_PT_FUP: + if (decoder->in_psb) + decoder->pge = true; + break; + + case INTEL_PT_TRACESTOP: + decoder->pge = false; + decoder->vm_tm_corr_continuous = false; + decoder->have_tma = 
false; + break; + + case INTEL_PT_PSB: + intel_pt_vm_tm_corr_psb(decoder, &data); + break; + + case INTEL_PT_PIP: + decoder->pip_payload = decoder->packet.payload; + break; + + case INTEL_PT_MTC: + intel_pt_calc_mtc_timestamp(decoder); + break; + + case INTEL_PT_TSC: + intel_pt_vm_tm_corr_tsc(decoder, &data); + intel_pt_calc_tsc_timestamp(decoder); + decoder->vm_tm_corr_same_buf = true; + decoder->vm_tm_corr_continuous = decoder->pge; + break; + + case INTEL_PT_TMA: + intel_pt_calc_tma(decoder); + break; + + case INTEL_PT_CYC: + intel_pt_calc_cyc_timestamp(decoder); + break; + + case INTEL_PT_CBR: + intel_pt_calc_cbr(decoder); + break; + + case INTEL_PT_PSBEND: + decoder->in_psb = false; + data.psbend = false; + break; + + case INTEL_PT_VMCS: + if (decoder->packet.payload != NO_VMCS) + decoder->vmcs = decoder->packet.payload; + break; + + case INTEL_PT_BBP: + decoder->blk_type = decoder->packet.payload; + break; + + case INTEL_PT_BIP: + if (decoder->blk_type == INTEL_PT_PEBS_BASIC && + decoder->packet.count == 2) + intel_pt_vm_tm_corr_pebs_tsc(decoder); + break; + + case INTEL_PT_BEP: + case INTEL_PT_BEP_IP: + decoder->blk_type = 0; + break; + + case INTEL_PT_MODE_EXEC: + case INTEL_PT_MODE_TSX: + case INTEL_PT_MNT: + case INTEL_PT_PAD: + case INTEL_PT_PTWRITE_IP: + case INTEL_PT_PTWRITE: + case INTEL_PT_MWAIT: + case INTEL_PT_PWRE: + case INTEL_PT_EXSTOP_IP: + case INTEL_PT_EXSTOP: + case INTEL_PT_PWRX: + case INTEL_PT_BAD: /* Does not happen */ + default: + break; + } + } + + return err; +} + #define HOP_PROCESS 0 #define HOP_IGNORE 1 #define HOP_RETURN 2 @@ -2898,6 +3591,15 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder) if (err) return err; + if (decoder->vm_time_correlation) { + decoder->in_psb = true; + if (!decoder->timestamp) + decoder->timestamp = 1; + decoder->state.type = 0; + decoder->pkt_state = INTEL_PT_STATE_VM_TIME_CORRELATION; + return 0; + } + decoder->have_last_ip = true; decoder->pkt_state = INTEL_PT_STATE_NO_IP; @@ -2985,6 +3687,9 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) case INTEL_PT_STATE_RESAMPLE: err = intel_pt_resample(decoder); break; + case INTEL_PT_STATE_VM_TIME_CORRELATION: + err = intel_pt_vm_time_correlation(decoder); + break; default: err = intel_pt_bug(decoder); break; @@ -3231,6 +3936,7 @@ static unsigned char *adj_for_padding(unsigned char *buf_b, * @len_b: size of second buffer * @consecutive: returns true if there is data in buf_b that is consecutive * to buf_a + * @ooo_tsc: out-of-order TSC due to VM TSC offset / scaling * * If the trace contains TSC we can look at the last TSC of @buf_a and the * first TSC of @buf_b in order to determine if the buffers overlap, and then @@ -3243,7 +3949,8 @@ static unsigned char *adj_for_padding(unsigned char *buf_b, static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, - size_t len_b, bool *consecutive) + size_t len_b, bool *consecutive, + bool ooo_tsc) { uint64_t tsc_a, tsc_b; unsigned char *p; @@ -3278,7 +3985,7 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, start = buf_b + len_b - (rem_b - rem_a); return adj_for_padding(start, buf_a, len_a); } - if (cmp < 0) + if (cmp < 0 && !ooo_tsc) return buf_b; /* tsc_a < tsc_b => no overlap */ } @@ -3296,6 +4003,7 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, * @have_tsc: can use TSC packets to detect overlap * @consecutive: returns true if there is data in buf_b that is consecutive * to buf_a + * @ooo_tsc: 
out-of-order TSC due to VM TSC offset / scaling * * When trace samples or snapshots are recorded there is the possibility that * the data overlaps. Note that, for the purposes of decoding, data is only @@ -3306,7 +4014,8 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, */ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, size_t len_b, - bool have_tsc, bool *consecutive) + bool have_tsc, bool *consecutive, + bool ooo_tsc) { unsigned char *found; @@ -3319,7 +4028,7 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, if (have_tsc) { found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b, - consecutive); + consecutive, ooo_tsc); if (found) return found; } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index d9e62a7f6f0e..714c475808c0 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -11,6 +11,8 @@ #include <stddef.h> #include <stdbool.h> +#include <linux/rbtree.h> + #include "intel-pt-insn-decoder.h" #define INTEL_PT_IN_TX (1 << 0) @@ -199,6 +201,14 @@ struct intel_pt_blk_items { bool is_32_bit; }; +struct intel_pt_vmcs_info { + struct rb_node rb_node; + uint64_t vmcs; + uint64_t tsc_offset; + bool reliable; + bool error_printed; +}; + struct intel_pt_state { enum intel_pt_sample_type type; bool from_nr; @@ -244,9 +254,13 @@ struct intel_pt_params { uint64_t max_insn_cnt, void *data); bool (*pgd_ip)(uint64_t ip, void *data); int (*lookahead)(void *data, intel_pt_lookahead_cb_t cb, void *cb_data); + struct intel_pt_vmcs_info *(*findnew_vmcs_info)(void *data, uint64_t vmcs); void *data; bool return_compression; bool branch_enable; + bool vm_time_correlation; + bool vm_tm_corr_dry_run; + uint64_t first_timestamp; uint64_t ctl; uint64_t period; enum intel_pt_period_type period_type; @@ -269,8 +283,12 @@ int intel_pt_fast_forward(struct intel_pt_decoder *decoder, uint64_t timestamp); unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, unsigned char *buf_b, size_t len_b, - bool have_tsc, bool *consecutive); + bool have_tsc, bool *consecutive, + bool ooo_tsc); int intel_pt__strerror(int code, char *buf, size_t buflen); +void intel_pt_set_first_timestamp(struct intel_pt_decoder *decoder, + uint64_t first_timestamp); + #endif diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h index 388661f89c44..d900aab24b21 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-log.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.h @@ -67,4 +67,9 @@ static inline void intel_pt_log_to(const char *msg, uint64_t u) intel_pt_log("%s to " x64_fmt "\n", msg, u); } +#define intel_pt_log_var(var, fmt) intel_pt_log("%s: " #var " " fmt "\n", __func__, var) + +#define intel_pt_log_x32(var) intel_pt_log_var(var, "%#x") +#define intel_pt_log_x64(var) intel_pt_log_var(var, "%#" PRIx64) + #endif diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 0dfec8761b9a..154a1077f22e 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -78,6 +78,7 @@ struct intel_pt { u64 kernel_start; u64 switch_ip; u64 ptss_ip; + u64 first_timestamp; struct perf_tsc_conversion tc; bool cap_user_time_zero; @@ -133,6 +134,9 @@ struct intel_pt { struct ip_callchain *chain; struct branch_stack *br_stack; + + u64 dflt_tsc_offset; + struct rb_root vmcs_info; }; enum switch_state { @@ -271,6 
+275,65 @@ static bool intel_pt_log_events(struct intel_pt *pt, u64 tm) return !n || !perf_time__ranges_skip_sample(range, n, tm); } +static struct intel_pt_vmcs_info *intel_pt_findnew_vmcs(struct rb_root *rb_root, + u64 vmcs, + u64 dflt_tsc_offset) +{ + struct rb_node **p = &rb_root->rb_node; + struct rb_node *parent = NULL; + struct intel_pt_vmcs_info *v; + + while (*p) { + parent = *p; + v = rb_entry(parent, struct intel_pt_vmcs_info, rb_node); + + if (v->vmcs == vmcs) + return v; + + if (vmcs < v->vmcs) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + v = zalloc(sizeof(*v)); + if (v) { + v->vmcs = vmcs; + v->tsc_offset = dflt_tsc_offset; + v->reliable = dflt_tsc_offset; + + rb_link_node(&v->rb_node, parent, p); + rb_insert_color(&v->rb_node, rb_root); + } + + return v; +} + +static struct intel_pt_vmcs_info *intel_pt_findnew_vmcs_info(void *data, uint64_t vmcs) +{ + struct intel_pt_queue *ptq = data; + struct intel_pt *pt = ptq->pt; + + if (!vmcs && !pt->dflt_tsc_offset) + return NULL; + + return intel_pt_findnew_vmcs(&pt->vmcs_info, vmcs, pt->dflt_tsc_offset); +} + +static void intel_pt_free_vmcs_info(struct intel_pt *pt) +{ + struct intel_pt_vmcs_info *v; + struct rb_node *n; + + n = rb_first(&pt->vmcs_info); + while (n) { + v = rb_entry(n, struct intel_pt_vmcs_info, rb_node); + n = rb_next(n); + rb_erase(&v->rb_node, &pt->vmcs_info); + free(v); + } +} + static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a, struct auxtrace_buffer *b) { @@ -278,9 +341,17 @@ static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer * void *start; start = intel_pt_find_overlap(a->data, a->size, b->data, b->size, - pt->have_tsc, &consecutive); + pt->have_tsc, &consecutive, + pt->synth_opts.vm_time_correlation); if (!start) return -EINVAL; + /* + * In the case of vm_time_correlation, the overlap might contain TSC + * packets that will not be fixed, and that will then no longer work for + * overlap detection. Avoid that by zeroing out the overlap. 
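+ * Zeroed bytes decode as PAD packets, so the next overlap check will + * not find stale TSC packets in the zeroed region.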
+ */ + if (pt->synth_opts.vm_time_correlation) + memset(b->data, 0, start - b->data); b->use_size = b->data + b->size - start; b->use_data = start; if (b->use_size && consecutive) @@ -901,7 +972,7 @@ static bool intel_pt_timeless_decoding(struct intel_pt *pt) bool timeless_decoding = true; u64 config; - if (!pt->tsc_bit || !pt->cap_user_time_zero) + if (!pt->tsc_bit || !pt->cap_user_time_zero || pt->synth_opts.timeless_decoding) return true; evlist__for_each_entry(pt->session->evlist, evsel) { @@ -949,6 +1020,19 @@ static bool intel_pt_have_tsc(struct intel_pt *pt) return have_tsc; } +static bool intel_pt_have_mtc(struct intel_pt *pt) +{ + struct evsel *evsel; + u64 config; + + evlist__for_each_entry(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->core.attr, &config) && + (config & pt->mtc_bit)) + return true; + } + return false; +} + static bool intel_pt_sampling_mode(struct intel_pt *pt) { struct evsel *evsel; @@ -1103,6 +1187,7 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, params.get_trace = intel_pt_get_trace; params.walk_insn = intel_pt_walk_next_insn; params.lookahead = intel_pt_lookahead; + params.findnew_vmcs_info = intel_pt_findnew_vmcs_info; params.data = ptq; params.return_compression = intel_pt_return_compression(pt); params.branch_enable = intel_pt_branch_enable(pt); @@ -1112,6 +1197,9 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, params.tsc_ctc_ratio_n = pt->tsc_ctc_ratio_n; params.tsc_ctc_ratio_d = pt->tsc_ctc_ratio_d; params.quick = pt->synth_opts.quick; + params.vm_time_correlation = pt->synth_opts.vm_time_correlation; + params.vm_tm_corr_dry_run = pt->synth_opts.vm_tm_corr_dry_run; + params.first_timestamp = pt->first_timestamp; if (pt->filts.cnt > 0) params.pgd_ip = intel_pt_pgd_ip; @@ -1176,6 +1264,21 @@ static void intel_pt_free_queue(void *priv) free(ptq); } +static void intel_pt_first_timestamp(struct intel_pt *pt, u64 timestamp) +{ + unsigned int i; + + pt->first_timestamp = timestamp; + + for (i = 0; i < pt->queues.nr_queues; i++) { + struct auxtrace_queue *queue = &pt->queues.queue_array[i]; + struct intel_pt_queue *ptq = queue->priv; + + if (ptq && ptq->decoder) + intel_pt_set_first_timestamp(ptq->decoder, timestamp); + } +} + static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt, struct auxtrace_queue *queue) { @@ -2379,7 +2482,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) if (pt->per_cpu_mmaps && (pt->have_sched_switch == 1 || pt->have_sched_switch == 3) && !pt->timeless_decoding && intel_pt_tracing_kernel(pt) && - !pt->sampling_mode) { + !pt->sampling_mode && !pt->synth_opts.vm_time_correlation) { pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip); if (pt->switch_ip) { intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n", @@ -2878,6 +2981,8 @@ static int intel_pt_process_event(struct perf_session *session, sample->time); } } else if (timestamp) { + if (!pt->first_timestamp) + intel_pt_first_timestamp(pt, timestamp); err = intel_pt_process_queues(pt, timestamp); } if (err) @@ -2964,6 +3069,7 @@ static void intel_pt_free(struct perf_session *session) auxtrace_heap__free(&pt->heap); intel_pt_free_events(session); session->auxtrace = NULL; + intel_pt_free_vmcs_info(pt); thread__put(pt->unknown_thread); addr_filters__exit(&pt->filts); zfree(&pt->chain); @@ -3407,6 +3513,65 @@ static int intel_pt_setup_time_ranges(struct intel_pt *pt, return 0; } +static int intel_pt_parse_vm_tm_corr_arg(struct intel_pt *pt, char **args) +{ + struct 
intel_pt_vmcs_info *vmcs_info; + u64 tsc_offset, vmcs; + char *p = *args; + + errno = 0; + + p = skip_spaces(p); + if (!*p) + return 1; + + tsc_offset = strtoull(p, &p, 0); + if (errno) + return -errno; + p = skip_spaces(p); + if (*p != ':') { + pt->dflt_tsc_offset = tsc_offset; + *args = p; + return 0; + } + /* Skip the ':' and parse the comma-separated list of VMCS values */ + p += 1; + while (1) { + vmcs = strtoull(p, &p, 0); + if (errno) + return -errno; + if (!vmcs) + return -EINVAL; + vmcs_info = intel_pt_findnew_vmcs(&pt->vmcs_info, vmcs, tsc_offset); + if (!vmcs_info) + return -ENOMEM; + p = skip_spaces(p); + if (*p != ',') + break; + p += 1; + } + *args = p; + return 0; +} + +static int intel_pt_parse_vm_tm_corr_args(struct intel_pt *pt) +{ + char *args = pt->synth_opts.vm_tm_corr_args; + int ret; + + if (!args) + return 0; + + do { + ret = intel_pt_parse_vm_tm_corr_arg(pt, &args); + } while (!ret); + + if (ret < 0) { + pr_err("Failed to parse VM Time Correlation options\n"); + return ret; + } + + return 0; +} + static const char * const intel_pt_info_fmts[] = { [INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n", [INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n", @@ -3469,6 +3634,8 @@ int intel_pt_process_auxtrace_info(union perf_event *event, if (!pt) return -ENOMEM; + pt->vmcs_info = RB_ROOT; + addr_filters__init(&pt->filts); err = perf_config(intel_pt_perf_config, pt); @@ -3481,6 +3648,20 @@ int intel_pt_process_auxtrace_info(union perf_event *event, intel_pt_log_set_name(INTEL_PT_PMU_NAME); + if (session->itrace_synth_opts->set) { + pt->synth_opts = *session->itrace_synth_opts; + } else { + struct itrace_synth_opts *opts = session->itrace_synth_opts; + + itrace_synth_opts__set_default(&pt->synth_opts, opts->default_no_sample); + if (!opts->default_no_sample && !opts->inject) { + pt->synth_opts.branches = false; + pt->synth_opts.callchain = true; + pt->synth_opts.add_callchain = true; + } + pt->synth_opts.thread_stack = opts->thread_stack; + } + pt->session = session; pt->machine = &session->machines.host; /* No kvm support */ pt->auxtrace_type = auxtrace_info->type; @@ -3562,6 +3743,28 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pt->sampling_mode = intel_pt_sampling_mode(pt); pt->est_tsc = !pt->timeless_decoding; + if (pt->synth_opts.vm_time_correlation) { + if (pt->timeless_decoding) { + pr_err("Intel PT has no time information for VM Time Correlation\n"); + err = -EINVAL; + goto err_free_queues; + } + if (session->itrace_synth_opts->ptime_range) { + pr_err("Time ranges cannot be specified with VM Time Correlation\n"); + err = -EINVAL; + goto err_free_queues; + } + /* Currently TSC Offset is calculated using MTC packets */ + if (!intel_pt_have_mtc(pt)) { + pr_err("MTC packets must have been enabled for VM Time Correlation\n"); + err = -EINVAL; + goto err_free_queues; + } + err = intel_pt_parse_vm_tm_corr_args(pt); + if (err) + goto err_free_queues; + } + pt->unknown_thread = thread__new(999999999, 999999999); if (!pt->unknown_thread) { err = -ENOMEM; @@ -3611,21 +3814,6 @@ int intel_pt_process_auxtrace_info(union perf_event *event, goto err_delete_thread; } - if (session->itrace_synth_opts->set) { - pt->synth_opts = *session->itrace_synth_opts; - } else { - itrace_synth_opts__set_default(&pt->synth_opts, - session->itrace_synth_opts->default_no_sample); - if (!session->itrace_synth_opts->default_no_sample && - !session->itrace_synth_opts->inject) { - pt->synth_opts.branches = false; - pt->synth_opts.callchain = true; - pt->synth_opts.add_callchain = true; - } - pt->synth_opts.thread_stack = - session->itrace_synth_opts->thread_stack; - } - 
if (pt->synth_opts.log) intel_pt_log_enable(); diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index d0fa7bc50a76..2b186c26a43e 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -19,4 +19,15 @@ struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); bool perf_pmu__is_hybrid(const char *name); char *perf_pmu__hybrid_type_to_pmu(const char *type); +static inline int perf_pmu__hybrid_pmu_num(void) +{ + struct perf_pmu *pmu; + int num = 0; + + perf_pmu__for_each_hybrid_pmu(pmu) + num++; + + return num; +} + #endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 106b3d60881a..0dbb4f2628f3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2155,6 +2155,7 @@ struct reader { u64 data_size; u64 data_offset; reader_cb_t process; + bool in_place_update; }; static int @@ -2188,7 +2189,9 @@ reader__process_events(struct reader *rd, struct perf_session *session, mmap_prot = PROT_READ; mmap_flags = MAP_SHARED; - if (session->header.needs_swap) { + if (rd->in_place_update) { + mmap_prot |= PROT_WRITE; + } else if (session->header.needs_swap) { mmap_prot |= PROT_WRITE; mmap_flags = MAP_PRIVATE; } @@ -2274,6 +2277,7 @@ static int __perf_session__process_events(struct perf_session *session) .data_size = session->header.data_size, .data_offset = session->header.data_offset, .process = process_simple, + .in_place_update = session->data->in_place_update, }; struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index a76fff5e7d83..b759dfd633b4 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -827,11 +827,11 @@ static void counter_aggr_cb(struct perf_stat_config *config __maybe_unused, bool first __maybe_unused) { struct caggr_data *cd = data; - struct perf_stat_evsel *ps = counter->stats; + struct perf_counts_values *aggr = &counter->counts->aggr; - cd->avg += avg_stats(&ps->res_stats[0]); - cd->avg_enabled += avg_stats(&ps->res_stats[1]); - cd->avg_running += avg_stats(&ps->res_stats[2]); + cd->avg += aggr->val; + cd->avg_enabled += aggr->ena; + cd->avg_running += aggr->run; } /* diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2db46b9bebd0..d3ec2624e036 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -437,18 +437,6 @@ int perf_stat_process_counter(struct perf_stat_config *config, aggr->val = aggr->ena = aggr->run = 0; - /* - * We calculate counter's data every interval, - * and the display code shows ps->res_stats - * avg value. We need to zero the stats for - * interval mode, otherwise overall avg running - * averages will be shown for each interval. - */ - if (config->interval || config->summary) { - for (i = 0; i < 3; i++) - init_stats(&ps->res_stats[i]); - } - if (counter->per_pkg) evsel__zero_per_pkg(counter); |