Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
 "ARM64:

   - Eager page splitting optimization for dirty logging, optionally
     allowing for a VM to avoid the cost of hugepage splitting in the
     stage-2 fault path.

   - Arm FF-A proxy for pKVM, allowing a pKVM host to safely interact
     with services that live in the Secure world. pKVM intervenes on
     FF-A calls to guarantee the host doesn't misuse memory donated to
     the hyp or a pKVM guest.

   - Support for running the split hypervisor with VHE enabled, known
     as 'hVHE' mode. This is extremely useful for testing the split
     hypervisor on VHE-only systems, and paves the way for new use
     cases that depend on having two TTBRs available at EL2.

   - Generalized framework for configurable ID registers from
     userspace. KVM/arm64 currently prevents arbitrary CPU feature set
     configuration from userspace, but the intent is to relax this
     limitation and allow userspace to select a feature set consistent
     with the CPU.

   - Enable the use of Branch Target Identification (FEAT_BTI) in the
     hypervisor.

   - Use a separate set of pointer authentication keys for the
     hypervisor when running in protected mode, as the host is
     untrusted at runtime.

   - Ensure timer IRQs are consistently released in the init failure
     paths.

   - Avoid trapping CTR_EL0 on systems with Enhanced Virtualization
     Traps (FEAT_EVT), as it is a register commonly read from
     userspace.

   - Erratum workaround for the upcoming AmpereOne part, which has
     broken hardware A/D state management.

  RISC-V:

   - Redirect AMO load/store misaligned traps to KVM guest

   - Trap-n-emulate AIA in-kernel irqchip for KVM guest

   - Svnapot support for KVM Guest

  s390:

   - New uvdevice secret API

   - CMM selftest and fixes

   - fix racy access to target CPU for diag 9c

  x86:

   - Fix missing/incorrect #GP checks on ENCLS

   - Use standard mmu_notifier hooks for handling APIC access page

   - Drop now unnecessary TR/TSS load after VM-Exit on AMD

   - Print more descriptive information about the status of SEV and
     SEV-ES during module load

   - Add a test for splitting and reconstituting hugepages during and
     after dirty logging

   - Add support for CPU pinning in demand paging test

   - Add support for AMD PerfMonV2, with a variety of cleanups and
     minor fixes included along the way

   - Add a "nx_huge_pages=never" option to effectively avoid creating
     NX hugepage recovery threads (because nx_huge_pages=off can be
     toggled at runtime)

   - Move handling of PAT out of MTRR code and dedup SVM+VMX code

   - Fix output of PIC poll command emulation when there's an
     interrupt

   - Add a maintainer's handbook to document KVM x86 processes,
     preferred coding style, testing expectations, etc.

   - Misc cleanups, fixes and comments

  Generic:

   - Miscellaneous bugfixes and cleanups

  Selftests:

   - Generate dependency files so that partial rebuilds work as
     expected"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (153 commits)
  Documentation/process: Add a maintainer handbook for KVM x86
  Documentation/process: Add a label for the tip tree handbook's coding style
  KVM: arm64: Fix misuse of KVM_ARM_VCPU_POWER_OFF bit index
  RISC-V: KVM: Remove unneeded semicolon
  RISC-V: KVM: Allow Svnapot extension for Guest/VM
  riscv: kvm: define vcpu_sbi_ext_pmu in header
  RISC-V: KVM: Expose IMSIC registers as attributes of AIA irqchip
  RISC-V: KVM: Add in-kernel virtualization of AIA IMSIC
  RISC-V: KVM: Expose APLIC registers as attributes of AIA irqchip
  RISC-V: KVM: Add in-kernel emulation of AIA APLIC
  RISC-V: KVM: Implement device interface for AIA irqchip
  RISC-V: KVM: Skeletal in-kernel AIA irqchip support
  RISC-V: KVM: Set kvm_riscv_aia_nr_hgei to zero
  RISC-V: KVM: Add APLIC related defines
  RISC-V: KVM: Add IMSIC related defines
  RISC-V: KVM: Implement guest external interrupt line management
  KVM: x86: Remove PRIx* definitions as they are solely for user space
  s390/uv: Update query for secret-UVCs
  s390/uv: replace scnprintf with sysfs_emit
  s390/uvdevice: Add 'Lock Secret Store' UVC
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-07-03 15:32:22 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-07-03 15:32:22 -0700
commit: e8069f5a8e3bdb5fdeeff895780529388592ee7a (patch)
tree: ce35ab85db9b66a7e488707fccdb33ce54f696dd /tools
parent: eded37770c9f80ecd5ba842359c4f1058d9812c3 (diff)
parent: 255006adb3da71bb75c334453786df781b415f54 (diff)
13 files changed, 1138 insertions, 118 deletions
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 4761b768b773..c692cc86e7da 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -61,6 +61,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
 # Compiled test targets
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
@@ -164,6 +165,7 @@ TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
 TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += s390x/tprot
+TEST_GEN_PROGS_s390x += s390x/cmma_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
@@ -184,6 +186,8 @@ TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR))
 TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR))
 LIBKVM += $(LIBKVM_$(ARCH_DIR))
 
+OVERRIDE_TARGETS = 1
+
 # lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most
 # importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`,
 # which causes the environment variable to override the makefile).
@@ -198,7 +202,7 @@ else
 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
 endif
 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-	-Wno-gnu-variable-sized-type-not-at-end \
+	-Wno-gnu-variable-sized-type-not-at-end -MD\
 	-fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \
 	-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
 	-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
@@ -225,7 +229,18 @@ LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
 LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
 
-EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
+TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS))
+TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED))
+TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ))
+TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS))
+-include $(TEST_DEP_FILES)
+
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): %: %.o
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@
+$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+EXTRA_CLEAN += $(LIBKVM_OBJS) $(TEST_DEP_FILES) $(TEST_GEN_OBJ) cscope.*
 
 x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
 $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 2439c4043fed..09c116a82a84 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -128,6 +128,7 @@ static void prefault_mem(void *alias, uint64_t len)
 
 static void run_test(enum vm_guest_mode mode, void *arg)
 {
+	struct memstress_vcpu_args *vcpu_args;
 	struct test_params *p = arg;
 	struct uffd_desc **uffd_descs = NULL;
 	struct timespec start;
@@ -145,24 +146,24 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 		    "Failed to allocate buffer for guest data pattern");
 	memset(guest_data_prototype, 0xAB, demand_paging_size);
 
+	if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) {
+		for (i = 0; i < nr_vcpus; i++) {
+			vcpu_args = &memstress_args.vcpu_args[i];
+			prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa),
+				     vcpu_args->pages * memstress_args.guest_page_size);
+		}
+	}
+
 	if (p->uffd_mode) {
 		uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
 		TEST_ASSERT(uffd_descs, "Memory allocation failed");
-
 		for (i = 0; i < nr_vcpus; i++) {
-			struct memstress_vcpu_args *vcpu_args;
 			void *vcpu_hva;
-			void *vcpu_alias;
 
 			vcpu_args = &memstress_args.vcpu_args[i];
 
 			/* Cache the host addresses of the region */
 			vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
-			vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
-
-			prefault_mem(vcpu_alias,
-				vcpu_args->pages * memstress_args.guest_page_size);
-
 			/*
 			 * Set up user fault fd to handle demand paging
 			 * requests.
@@ -207,10 +208,11 @@ static void help(char *name)
 {
 	puts("");
 	printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n"
-	       "          [-b memory] [-s type] [-v vcpus] [-o]\n", name);
+	       "          [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name);
 	guest_modes_help();
 	printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
 	       "     UFFD registration mode: 'MISSING' or 'MINOR'.\n");
+	kvm_print_vcpu_pinning_help();
 	printf(" -d: add a delay in usec to the User Fault\n"
 	       "     FD handler to simulate demand paging\n"
 	       "     overheads. Ignored without -u.\n");
@@ -228,6 +230,7 @@ static void help(char *name)
 int main(int argc, char *argv[])
 {
 	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	const char *cpulist = NULL;
 	struct test_params p = {
 		.src_type = DEFAULT_VM_MEM_SRC,
 		.partition_vcpu_memory_access = true,
@@ -236,7 +239,7 @@ int main(int argc, char *argv[])
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
@@ -263,6 +266,9 @@ int main(int argc, char *argv[])
 			TEST_ASSERT(nr_vcpus <= max_vcpus,
 				    "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
 			break;
+		case 'c':
+			cpulist = optarg;
+			break;
 		case 'o':
 			p.partition_vcpu_memory_access = false;
 			break;
@@ -278,6 +284,12 @@ int main(int argc, char *argv[])
 		TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s");
 	}
 
+	if (cpulist) {
+		kvm_parse_vcpu_pinning(cpulist, memstress_args.vcpu_to_pcpu,
+				       nr_vcpus);
+		memstress_args.pin_vcpus = true;
+	}
+
 	for_each_guest_mode(run_test, &p);
 
 	return 0;
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index e9d6d1aecf89..d374dbcf9a53 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -136,77 +136,6 @@ struct test_params {
 	bool random_access;
 };
 
-static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
-{
-	int i;
-
-	for (i = 0; i < slots; i++) {
-		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
-		int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
-
-		vm_mem_region_set_flags(vm, slot, flags);
-	}
-}
-
-static inline void enable_dirty_logging(struct kvm_vm *vm, int slots)
-{
-	toggle_dirty_logging(vm, slots, true);
-}
-
-static inline void disable_dirty_logging(struct kvm_vm *vm, int slots)
-{
-	toggle_dirty_logging(vm, slots, false);
-}
-
-static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots)
-{
-	int i;
-
-	for (i = 0; i < slots; i++) {
-		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
-
-		kvm_vm_get_dirty_log(vm, slot, bitmaps[i]);
-	}
-}
-
-static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
-			    int slots, uint64_t pages_per_slot)
-{
-	int i;
-
-	for (i = 0; i < slots; i++) {
-		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
-
-		kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot);
-	}
-}
-
-static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot)
-{
-	unsigned long **bitmaps;
-	int i;
-
-	bitmaps = malloc(slots * sizeof(bitmaps[0]));
-	TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array.");
-
-	for (i = 0; i < slots; i++) {
-		bitmaps[i] = bitmap_zalloc(pages_per_slot);
-		TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap.");
-	}
-
-	return bitmaps;
-}
-
-static void free_bitmaps(unsigned long *bitmaps[], int slots)
-{
-	int i;
-
-	for (i = 0; i < slots; i++)
-		free(bitmaps[i]);
-
-	free(bitmaps);
-}
-
 static void run_test(enum vm_guest_mode mode, void *arg)
 {
 	struct test_params *p = arg;
@@ -236,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
 	pages_per_slot = host_num_pages / p->slots;
 
-	bitmaps = alloc_bitmaps(p->slots, pages_per_slot);
+	bitmaps = memstress_alloc_bitmaps(p->slots, pages_per_slot);
 
 	if (dirty_log_manual_caps)
 		vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
@@ -277,7 +206,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	/* Enable dirty logging */
 	clock_gettime(CLOCK_MONOTONIC, &start);
-	enable_dirty_logging(vm, p->slots);
+	memstress_enable_dirty_logging(vm, p->slots);
 	ts_diff = timespec_elapsed(start);
 	pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
 		ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -306,7 +235,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
 		clock_gettime(CLOCK_MONOTONIC, &start);
-		get_dirty_log(vm, bitmaps, p->slots);
+		memstress_get_dirty_log(vm, bitmaps, p->slots);
 		ts_diff = timespec_elapsed(start);
 		get_dirty_log_total = timespec_add(get_dirty_log_total,
 						   ts_diff);
@@ -315,7 +244,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 		if (dirty_log_manual_caps) {
 			clock_gettime(CLOCK_MONOTONIC, &start);
-			clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot);
+			memstress_clear_dirty_log(vm, bitmaps, p->slots,
+						  pages_per_slot);
 			ts_diff = timespec_elapsed(start);
 			clear_dirty_log_total = timespec_add(clear_dirty_log_total,
 							     ts_diff);
@@ -334,7 +264,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	/* Disable dirty logging */
 	clock_gettime(CLOCK_MONOTONIC, &start);
-	disable_dirty_logging(vm, p->slots);
+	memstress_disable_dirty_logging(vm, p->slots);
 	ts_diff = timespec_elapsed(start);
 	pr_info("Disabling dirty logging time: %ld.%.9lds\n",
 		ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -359,7 +289,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 			clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
 	}
 
-	free_bitmaps(bitmaps, p->slots);
+	memstress_free_bitmaps(bitmaps, p->slots);
 	arch_cleanup_vm(vm);
 	memstress_destroy_vm(vm);
 }
@@ -402,17 +332,7 @@ static void help(char *name)
 	       "     so -w X means each page has an X%% chance of writing\n"
 	       "     and a (100-X)%% chance of reading.\n"
 	       "     (default: 100 i.e. all pages are written to.)\n");
-	printf(" -c: Pin tasks to physical CPUs.  Takes a list of comma separated\n"
-	       "     values (target pCPU), one for each vCPU, plus an optional\n"
-	       "     entry for the main application task (specified via entry\n"
-	       "     <nr_vcpus + 1>).  If used, entries must be provided for all\n"
-	       "     vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
-	       "     E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
-	       "     vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
-	       "         ./dirty_log_perf_test -v 3 -c 22,23,24,50\n\n"
-	       "     To leave the application task unpinned, drop the final entry:\n\n"
-	       "         ./dirty_log_perf_test -v 3 -c 22,23,24\n\n"
-	       "     (default: no pinning)\n");
+	kvm_print_vcpu_pinning_help();
 	puts("");
 	exit(0);
 }
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index a089c356f354..07732a157ccd 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -733,6 +733,7 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
 struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm);
 
 void kvm_pin_this_task_to_pcpu(uint32_t pcpu);
+void kvm_print_vcpu_pinning_help(void);
 void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
 			    int nr_vcpus);
 
diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h
index 72e3e358ef7b..ce4e603050ea 100644
--- a/tools/testing/selftests/kvm/include/memstress.h
+++ b/tools/testing/selftests/kvm/include/memstress.h
@@ -72,4 +72,12 @@ void memstress_guest_code(uint32_t vcpu_id);
 uint64_t memstress_nested_pages(int nr_vcpus);
 void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
 
+void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots);
+void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots);
+void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots);
+void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
+			       int slots, uint64_t pages_per_slot);
+unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot);
+void memstress_free_bitmaps(unsigned long *bitmaps[], int slots);
+
 #endif /* SELFTEST_KVM_MEMSTRESS_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 298c4372fb1a..9741a7ff6380 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -494,6 +494,23 @@ static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
 	return pcpu;
 }
 
+void kvm_print_vcpu_pinning_help(void)
+{
+	const char *name = program_invocation_name;
+
+	printf(" -c: Pin tasks to physical CPUs.  Takes a list of comma separated\n"
+	       "     values (target pCPU), one for each vCPU, plus an optional\n"
+	       "     entry for the main application task (specified via entry\n"
+	       "     <nr_vcpus + 1>).  If used, entries must be provided for all\n"
+	       "     vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
+	       "     E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
+	       "     vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
+	       "         %s -v 3 -c 22,23,24,50\n\n"
+	       "     To leave the application task unpinned, drop the final entry:\n\n"
+	       "         %s -v 3 -c 22,23,24\n\n"
+	       "     (default: no pinning)\n", name, name);
+}
+
 void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
 			    int nr_vcpus)
 {
diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c
index 5f1d3173c238..df457452d146 100644
--- a/tools/testing/selftests/kvm/lib/memstress.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -5,6 +5,7 @@
 #define _GNU_SOURCE
 
 #include <inttypes.h>
+#include <linux/bitmap.h>
 
 #include "kvm_util.h"
 #include "memstress.h"
@@ -64,6 +65,9 @@ void memstress_guest_code(uint32_t vcpu_idx)
 	GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx);
 
 	while (true) {
+		for (i = 0; i < sizeof(memstress_args); i += args->guest_page_size)
+			(void) *((volatile char *)args + i);
+
 		for (i = 0; i < pages; i++) {
 			if (args->random_access)
 				page = guest_random_u32(&rand_state) % pages;
@@ -320,3 +324,74 @@ void memstress_join_vcpu_threads(int nr_vcpus)
 	for (i = 0; i < nr_vcpus; i++)
 		pthread_join(vcpu_threads[i].thread, NULL);
 }
+
+static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
+{
+	int i;
+
+	for (i = 0; i < slots; i++) {
+		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+		int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
+
+		vm_mem_region_set_flags(vm, slot, flags);
+	}
+}
+
+void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+	toggle_dirty_logging(vm, slots, true);
+}
+
+void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+	toggle_dirty_logging(vm, slots, false);
+}
+
+void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots)
+{
+	int i;
+
+	for (i = 0; i < slots; i++) {
+		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+
+		kvm_vm_get_dirty_log(vm, slot, bitmaps[i]);
+	}
+}
+
+void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
+			       int slots, uint64_t pages_per_slot)
+{
+	int i;
+
+	for (i = 0; i < slots; i++) {
+		int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+
+		kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot);
+	}
+}
+
+unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot)
+{
+	unsigned long **bitmaps;
+	int i;
+
+	bitmaps = malloc(slots * sizeof(bitmaps[0]));
+	TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array.");
+
+	for (i = 0; i < slots; i++) {
+		bitmaps[i] = bitmap_zalloc(pages_per_slot);
+		TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap.");
+	}
+
+	return bitmaps;
+}
+
+void memstress_free_bitmaps(unsigned long *bitmaps[], int slots)
+{
+	int i;
+
+	for (i = 0; i < slots; i++)
+		free(bitmaps[i]);
+
+	free(bitmaps);
+}
diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
index 92cef20902f1..271f63891581 100644
--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@ -70,7 +70,7 @@ static void *uffd_handler_thread_fn(void *arg)
 			r = read(pollfd[1].fd, &tmp_chr, 1);
 			TEST_ASSERT(r == 1,
 				    "Error reading pipefd in UFFD thread\n");
-			return NULL;
+			break;
 		}
 
 		if (!(pollfd[0].revents & POLLIN))
@@ -103,7 +103,7 @@ static void *uffd_handler_thread_fn(void *arg)
 	ts_diff = timespec_elapsed(start);
 	PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
 		       pages, ts_diff.tv_sec, ts_diff.tv_nsec,
-		       pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
+		       pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC));
 
 	return NULL;
 }
diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c
new file mode 100644
index 000000000000..1d73e78e8fa7
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/cmma_test.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for s390x CMMA migration
+ *
+ * Copyright IBM Corp. 2023
+ *
+ * Authors:
+ *  Nico Boehr <nrb@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kselftest.h"
+
+#define MAIN_PAGE_COUNT 512
+
+#define TEST_DATA_PAGE_COUNT 512
+#define TEST_DATA_MEMSLOT 1
+#define TEST_DATA_START_GFN 4096
+
+#define TEST_DATA_TWO_PAGE_COUNT 256
+#define TEST_DATA_TWO_MEMSLOT 2
+#define TEST_DATA_TWO_START_GFN 8192
+
+static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT];
+
+/**
+ * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot,
+ * so use_cmma goes on and the CMMA related ioctls do something.
+ */
+static void guest_do_one_essa(void)
+{
+	asm volatile(
+		/* load TEST_DATA_START_GFN into r1 */
+		"	llilf 1,%[start_gfn]\n"
+		/* calculate the address from the gfn */
+		"	sllg 1,1,12(0)\n"
+		/* set the first page in TEST_DATA memslot to STABLE */
+		"	.insn rrf,0xb9ab0000,2,1,1,0\n"
+		/* hypercall */
+		"	diag 0,0,0x501\n"
+		"0:	j 0b"
+		:
+		: [start_gfn] "L"(TEST_DATA_START_GFN)
+		: "r1", "r2", "memory", "cc"
+	);
+}
+
+/**
+ * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable
+ * state.
+ */
+static void guest_dirty_test_data(void)
+{
+	asm volatile(
+		/* r1 = TEST_DATA_START_GFN */
+		"	xgr 1,1\n"
+		"	llilf 1,%[start_gfn]\n"
+		/* r5 = TEST_DATA_PAGE_COUNT */
+		"	lghi 5,%[page_count]\n"
+		/* r5 += r1 */
+		"2:	agfr 5,1\n"
+		/* r2 = r1 << 12 */
+		"1:	sllg 2,1,12(0)\n"
+		/* essa(r4, r2, SET_STABLE) */
+		"	.insn rrf,0xb9ab0000,4,2,1,0\n"
+		/* i++ */
+		"	agfi 1,1\n"
+		/* if r1 < r5 goto 1 */
+		"	cgrjl 1,5,1b\n"
+		/* hypercall */
+		"	diag 0,0,0x501\n"
+		"0:	j 0b"
+		:
+		: [start_gfn] "L"(TEST_DATA_START_GFN),
+		  [page_count] "L"(TEST_DATA_PAGE_COUNT)
+		:
+			/* the counter in our loop over the pages */
+			"r1",
+			/* the calculated page physical address */
+			"r2",
+			/* ESSA output register */
+			"r4",
+			/* last page */
+			"r5",
+			"cc", "memory"
+	);
+}
+
+static struct kvm_vm *create_vm(void)
+{
+	return ____vm_create(VM_MODE_DEFAULT);
+}
+
+static void create_main_memslot(struct kvm_vm *vm)
+{
+	int i;
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0);
+	/* set the array of memslots to zero like __vm_create does */
+	for (i = 0; i < NR_MEM_REGIONS; i++)
+		vm->memslots[i] = 0;
+}
+
+static void create_test_memslot(struct kvm_vm *vm)
+{
+	vm_userspace_mem_region_add(vm,
+				    VM_MEM_SRC_ANONYMOUS,
+				    TEST_DATA_START_GFN << vm->page_shift,
+				    TEST_DATA_MEMSLOT,
+				    TEST_DATA_PAGE_COUNT,
+				    0
+				   );
+	vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
+}
+
+static void create_memslots(struct kvm_vm *vm)
+{
+	/*
+	 * Our VM has the following memory layout:
+	 * +------+---------------------------+
+	 * | GFN  | Memslot                   |
+	 * +------+---------------------------+
+	 * | 0    |                           |
+	 * | ...  | MAIN (Code, Stack, ...)   |
+	 * | 511  |                           |
+	 * +------+---------------------------+
+	 * | 4096 |                           |
+	 * | ...  | TEST_DATA                 |
+	 * | 4607 |                           |
+	 * +------+---------------------------+
+	 */
+	create_main_memslot(vm);
+	create_test_memslot(vm);
+}
+
+static void finish_vm_setup(struct kvm_vm *vm)
+{
+	struct userspace_mem_region *slot0;
+
+	kvm_vm_elf_load(vm, program_invocation_name);
+
+	slot0 = memslot2region(vm, 0);
+	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
+
+	kvm_arch_vm_post_create(vm);
+}
+
+static struct kvm_vm *create_vm_two_memslots(void)
+{
+	struct kvm_vm *vm;
+
+	vm = create_vm();
+
+	create_memslots(vm);
+
+	finish_vm_setup(vm);
+
+	return vm;
+}
+
+static void enable_cmma(struct kvm_vm *vm)
+{
+	int r;
+
+	r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL);
+	TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno);
+}
+
+static void enable_dirty_tracking(struct kvm_vm *vm)
+{
+	vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES);
+	vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES);
+}
+
+static int __enable_migration_mode(struct kvm_vm *vm)
+{
+	return __kvm_device_attr_set(vm->fd,
+				     KVM_S390_VM_MIGRATION,
+				     KVM_S390_VM_MIGRATION_START,
+				     NULL
+				    );
+}
+
+static void enable_migration_mode(struct kvm_vm *vm)
+{
+	int r = __enable_migration_mode(vm);
+
+	TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno);
+}
+
+static bool is_migration_mode_on(struct kvm_vm *vm)
+{
+	u64 out;
+	int r;
+
+	r = __kvm_device_attr_get(vm->fd,
+				  KVM_S390_VM_MIGRATION,
+				  KVM_S390_VM_MIGRATION_STATUS,
+				  &out
+				 );
+	TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno);
+	return out;
+}
+
+static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out)
+{
+	struct kvm_s390_cmma_log args;
+	int rc;
+
+	errno = 0;
+
+	args = (struct kvm_s390_cmma_log){
+		.start_gfn = 0,
+		.count = sizeof(cmma_value_buf),
+		.flags = flags,
+		.values = (__u64)&cmma_value_buf[0]
+	};
+	rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+
+	*errno_out = errno;
+	return rc;
+}
+
+static void test_get_cmma_basic(void)
+{
+	struct kvm_vm *vm = create_vm_two_memslots();
+	struct kvm_vcpu *vcpu;
+	int rc, errno_out;
+
+	/* GET_CMMA_BITS without CMMA enabled should fail */
+	rc = vm_get_cmma_bits(vm, 0, &errno_out);
+	ASSERT_EQ(rc, -1);
+	ASSERT_EQ(errno_out, ENXIO);
+
+	enable_cmma(vm);
+	vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+
+	vcpu_run(vcpu);
+
+	/* GET_CMMA_BITS without migration mode and without peeking should fail */
+	rc = vm_get_cmma_bits(vm, 0, &errno_out);
+	ASSERT_EQ(rc, -1);
+	ASSERT_EQ(errno_out, EINVAL);
+
+	/* GET_CMMA_BITS without migration mode and with peeking should work */
+	rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out);
+	ASSERT_EQ(rc, 0);
+	ASSERT_EQ(errno_out, 0);
+
+	enable_dirty_tracking(vm);
+	enable_migration_mode(vm);
+
+	/* GET_CMMA_BITS with invalid flags */
+	rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out);
+	ASSERT_EQ(rc, -1);
+	ASSERT_EQ(errno_out, EINVAL);
+
+	kvm_vm_free(vm);
+}
+
+static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu)
+{
+	ASSERT_EQ(vcpu->run->exit_reason, 13);
+	ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4);
+	ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300);
+	ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000);
+}
+
+static void test_migration_mode(void)
+{
+	struct kvm_vm *vm = create_vm();
+	struct kvm_vcpu *vcpu;
+	u64 orig_psw;
+	int rc;
+
+	/* enabling migration mode on a VM without memory should fail */
+	rc = __enable_migration_mode(vm);
+	ASSERT_EQ(rc, -1);
+	ASSERT_EQ(errno, EINVAL);
+	TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off");
+	errno = 0;
+
+	create_memslots(vm);
+	finish_vm_setup(vm);
+
+	enable_cmma(vm);
+	vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+	orig_psw = vcpu->run->psw_addr;
+
+	/*
+	 * Execute one essa instruction in the guest. Otherwise the guest will
+	 * not have use_cmm enabled and GET_CMMA_BITS will return no pages.
+	 */
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	/* migration mode when memslots have dirty tracking off should fail */
+	rc = __enable_migration_mode(vm);
+	ASSERT_EQ(rc, -1);
+	ASSERT_EQ(errno, EINVAL);
+	TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off");
+	errno = 0;
+
+	/* enable dirty tracking */
+	enable_dirty_tracking(vm);
+
+	/* enabling migration mode should work now */
+	rc = __enable_migration_mode(vm);
+	ASSERT_EQ(rc, 0);
+	TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+	errno = 0;
+
+	/* execute another ESSA instruction to see this goes fine */
+	vcpu->run->psw_addr = orig_psw;
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	/*
+	 * With migration mode on, create a new memslot with dirty tracking off.
+	 * This should turn off migration mode.
+	 */
+	TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+	vm_userspace_mem_region_add(vm,
+				    VM_MEM_SRC_ANONYMOUS,
+				    TEST_DATA_TWO_START_GFN << vm->page_shift,
+				    TEST_DATA_TWO_MEMSLOT,
+				    TEST_DATA_TWO_PAGE_COUNT,
+				    0
+				   );
+	TEST_ASSERT(!is_migration_mode_on(vm),
+		    "creating memslot without dirty tracking turns off migration mode"
+		   );
+
+	/* ESSA instructions should still execute fine */
+	vcpu->run->psw_addr = orig_psw;
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	/*
+	 * Turn on dirty tracking on the new memslot.
+	 * It should be possible to turn migration mode back on again.
+	 */
+	vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES);
+	rc = __enable_migration_mode(vm);
+	ASSERT_EQ(rc, 0);
+	TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+	errno = 0;
+
+	/*
+	 * Turn off dirty tracking again, this time with just a flag change.
+	 * Again, migration mode should turn off.
+	 */
+	TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+	vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0);
+	TEST_ASSERT(!is_migration_mode_on(vm),
+		    "disabling dirty tracking should turn off migration mode"
+		   );
+
+	/* ESSA instructions should still execute fine */
+	vcpu->run->psw_addr = orig_psw;
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+/**
+ * Walk the CMMA dirty log of a VM that has the MAIN and TEST_DATA memslots
+ * and assert that every page of both slots - and nothing else - is reported
+ * dirty.
+ *
+ * As a useful side effect, nothing is CMMA dirty any more once this function
+ * has returned.
+ */
+static void assert_all_slots_cmma_dirty(struct kvm_vm *vm)
+{
+	const __u64 values_addr = (__u64)&cmma_value_buf[0];
+	struct kvm_s390_cmma_log args;
+
+	/*
+	 * Query 1, from GFN 0: everything should be dirty, so the complete
+	 * MAIN memslot comes back and the TEST_DATA pages remain.
+	 */
+	memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+	args = (struct kvm_s390_cmma_log){
+		.start_gfn = 0,
+		.count = sizeof(cmma_value_buf),
+		.flags = 0,
+		.values = values_addr,
+	};
+	vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+	ASSERT_EQ(args.count, MAIN_PAGE_COUNT);
+	ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT);
+	ASSERT_EQ(args.start_gfn, 0);
+
+	/*
+	 * Query 2, from the end of the MAIN memslot: after a hole, the
+	 * TEST_DATA memslot should follow.
+	 */
+	memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+	args = (struct kvm_s390_cmma_log){
+		.start_gfn = MAIN_PAGE_COUNT,
+		.count = sizeof(cmma_value_buf),
+		.flags = 0,
+		.values = values_addr,
+	};
+	vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+	ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT);
+	ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN);
+	ASSERT_EQ(args.remaining, 0);
+
+	/* Query 3, from behind the TEST_DATA memslot: nothing else is there. */
+	memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+	args = (struct kvm_s390_cmma_log){
+		.start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT,
+		.count = sizeof(cmma_value_buf),
+		.flags = 0,
+		.values = values_addr,
+	};
+	vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+	ASSERT_EQ(args.count, 0);
+	ASSERT_EQ(args.start_gfn, 0);
+	ASSERT_EQ(args.remaining, 0);
+}
+
+/**
+ * Query the CMMA dirty log of the given VM from GFN 0 and fail the test if
+ * any page is still reported as dirty.
+ */
+static void assert_no_pages_cmma_dirty(struct kvm_vm *vm)
+{
+	struct kvm_s390_cmma_log args = {
+		.start_gfn = 0,
+		.count = sizeof(cmma_value_buf),
+		.flags = 0,
+		.values = (__u64)&cmma_value_buf[0]
+	};
+
+	memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+	vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+
+	/* Nothing dirty: all three result fields come back as zero. */
+	if (!args.count && !args.remaining && !args.start_gfn)
+		return;
+
+	TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu",
+		  args.start_gfn,
+		  args.count,
+		  args.remaining
+		 );
+}
+
+/*
+ * Test case: once the guest uses CMMA and migration mode has been switched
+ * on, all pages of both memslots are reported as CMMA dirty, and a second
+ * walk over the log finds nothing left.
+ */
+static void test_get_inital_dirty(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = create_vm_two_memslots();
+	enable_cmma(vm);
+	vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+
+	/*
+	 * GET_CMMA_BITS will return no pages unless the guest has use_cmm
+	 * enabled, so let it execute one ESSA instruction first.
+	 */
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	enable_dirty_tracking(vm);
+	enable_migration_mode(vm);
+
+	/* First walk: everything is dirty ... */
+	assert_all_slots_cmma_dirty(vm);
+	/* ... second walk, from the beginning again: nothing else is. */
+	assert_no_pages_cmma_dirty(vm);
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Issue KVM_S390_GET_CMMA_BITS for gfn_count pages starting at start_gfn.
+ * The request is built in *res_out, which the ioctl is then handed as its
+ * argument. The values pointer is set to cmma_value_buf, which is filled
+ * with 0xff right before the call.
+ */
+static void query_cmma_range(struct kvm_vm *vm,
+			     u64 start_gfn, u64 gfn_count,
+			     struct kvm_s390_cmma_log *res_out)
+{
+	struct kvm_s390_cmma_log request = {
+		.start_gfn = start_gfn,
+		.count = gfn_count,
+		.flags = 0,
+		.values = (__u64)&cmma_value_buf[0]
+	};
+
+	memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+	*res_out = request;
+	vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out);
+}
+
+/**
+ * Assert the given cmma_log struct that was executed by query_cmma_range()
+ * indicates the first dirty gfn is at first_dirty_gfn and contains exactly
+ * dirty_gfn_count CMMA values.
+ *
+ * Each of the dirty_gfn_count values in cmma_value_buf must be 0x0 (stable
+ * state), and the entry directly behind them must still hold the 0xff fill
+ * pattern, i.e. must not have been written.
+ */
+static void assert_cmma_dirty(u64 first_dirty_gfn,
+			      u64 dirty_gfn_count,
+			      const struct kvm_s390_cmma_log *res)
+{
+	ASSERT_EQ(res->start_gfn, first_dirty_gfn);
+	ASSERT_EQ(res->count, dirty_gfn_count);
+	/* Check every returned value, not only the one at index 0. */
+	for (size_t i = 0; i < dirty_gfn_count; i++)
+		ASSERT_EQ(cmma_value_buf[i], 0x0); /* stable state */
+	ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */
+}
+
+/*
+ * Test case: GET_CMMA_BITS skips holes and pages that are not dirty.
+ *
+ * After the whole CMMA dirty log has been consumed once, the guest dirties
+ * the TEST_DATA memslot again. A sequence of queries with varying start GFNs
+ * and counts then checks which range each query reports. The comments below
+ * track the expected dirty state (as page numbers relative to the start of
+ * the TEST_DATA memslot) after each step.
+ */
+static void test_get_skip_holes(void)
+{
+	size_t gfn_offset;
+	struct kvm_vm *vm = create_vm_two_memslots();
+	struct kvm_s390_cmma_log log;
+	struct kvm_vcpu *vcpu;
+	u64 orig_psw;
+
+	enable_cmma(vm);
+	vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data);
+
+	/* Remember the entry point so the guest code can be run a second time. */
+	orig_psw = vcpu->run->psw_addr;
+
+	/*
+	 * Execute some essa instructions in the guest. Otherwise the guest will
+	 * not have use_cmm enabled and GET_CMMA_BITS will return no pages.
+	 */
+	vcpu_run(vcpu);
+	assert_exit_was_hypercall(vcpu);
+
+	enable_dirty_tracking(vm);
+	enable_migration_mode(vm);
+
+	/* un-dirty all pages */
+	assert_all_slots_cmma_dirty(vm);
+
+	/* Then, dirty just the TEST_DATA memslot */
+	vcpu->run->psw_addr = orig_psw;
+	vcpu_run(vcpu);
+
+	gfn_offset = TEST_DATA_START_GFN;
+	/*
+	 * Query CMMA attributes of one page, starting at page 0. Since the
+	 * main memslot was not touched by the VM, this should yield the first
+	 * page of the TEST_DATA memslot.
+	 * The dirty bitmap should now look like this:
+	 * 0: not dirty
+	 * [0x1, 0x200): dirty
+	 */
+	query_cmma_range(vm, 0, 1, &log);
+	assert_cmma_dirty(gfn_offset, 1, &log);
+	gfn_offset++;
+
+	/*
+	 * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA
+	 * memslot. This should wrap back to the beginning of the TEST_DATA
+	 * memslot, page 1.
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x21): not dirty
+	 * [0x21, 0x200): dirty
+	 */
+	query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log);
+	assert_cmma_dirty(gfn_offset, 0x20, &log);
+	gfn_offset += 0x20;
+
+	/* Skip 32 pages */
+	gfn_offset += 0x20;
+
+	/*
+	 * After skipping 32 pages, query the next 32 (0x20) pages.
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x21): not dirty
+	 * [0x21, 0x41): dirty
+	 * [0x41, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	query_cmma_range(vm, gfn_offset, 0x20, &log);
+	assert_cmma_dirty(gfn_offset, 0x20, &log);
+	gfn_offset += 0x20;
+
+	/*
+	 * Query 1 page from the beginning of the TEST_DATA memslot. This should
+	 * yield page 0x21.
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x22): not dirty
+	 * [0x22, 0x41): dirty
+	 * [0x41, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log);
+	assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log);
+
+	/*
+	 * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot.
+	 * This should yield pages [0x23, 0x33).
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x22): not dirty
+	 * 0x22: dirty
+	 * [0x23, 0x33): not dirty
+	 * [0x33, 0x41): dirty
+	 * [0x41, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	gfn_offset = TEST_DATA_START_GFN + 0x23;
+	query_cmma_range(vm, gfn_offset, 15, &log);
+	assert_cmma_dirty(gfn_offset, 15, &log);
+
+	/*
+	 * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot.
+	 * This should yield page [0x22, 0x33)
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x33): not dirty
+	 * [0x33, 0x41): dirty
+	 * [0x41, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	gfn_offset = TEST_DATA_START_GFN + 0x22;
+	query_cmma_range(vm, gfn_offset, 17, &log);
+	assert_cmma_dirty(gfn_offset, 17, &log);
+
+	/*
+	 * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot.
+	 * This should yield page 0x40 and nothing more, since there are more
+	 * than 16 non-dirty pages after page 0x40.
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x33): not dirty
+	 * [0x33, 0x40): dirty
+	 * [0x40, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	gfn_offset = TEST_DATA_START_GFN + 0x40;
+	query_cmma_range(vm, gfn_offset, 25, &log);
+	assert_cmma_dirty(gfn_offset, 1, &log);
+
+	/*
+	 * Query pages [0x33, 0x40).
+	 * The dirty bitmap should now look like this:
+	 * [0, 0x61): not dirty
+	 * [0x61, 0x200): dirty
+	 */
+	gfn_offset = TEST_DATA_START_GFN + 0x33;
+	query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log);
+	assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log);
+
+	/*
+	 * Query the remaining pages [0x61, 0x200).
+	 */
+	gfn_offset = TEST_DATA_START_GFN;
+	query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log);
+	assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log);
+
+	assert_no_pages_cmma_dirty(vm);
+
+	/* Tear down the VM that was created at the top of this test. */
+	kvm_vm_free(vm);
+}
+
+/*
+ * The test cases in execution order. main() calls each test() and reports
+ * name as the passed test.
+ */
+struct testdef {
+	const char *name;
+	void (*test)(void);
+} testlist[] = {
+	{ "migration mode and dirty tracking", test_migration_mode },
+	{ "GET_CMMA_BITS: basic calls", test_get_cmma_basic },
+	{ "GET_CMMA_BITS: all pages are dirty initially", test_get_inital_dirty },
+	{ "GET_CMMA_BITS: holes are skipped", test_get_skip_holes },
+};
+
+/**
+ * Probe whether the machine itself - not only the kernel - supports CMMA.
+ *
+ * The kernel may support CMMA while the machine does not (i.e. if running as
+ * guest-3). The CMMA capabilities are all there in that case, but the
+ * CMMA-related ioctls fail. Hence a temporary VM is created and queried for
+ * the CMMA feature.
+ *
+ * Return: 1 if __kvm_has_device_attr() returned 0 for the ENABLE_CMMA
+ * attribute of the temporary VM, 0 otherwise.
+ */
+static int machine_has_cmma(void)
+{
+	struct kvm_vm *probe_vm = create_vm();
+	int has_attr;
+
+	has_attr = !__kvm_has_device_attr(probe_vm->fd, KVM_S390_VM_MEM_CTRL,
+					  KVM_S390_VM_MEM_ENABLE_CMMA);
+	kvm_vm_free(probe_vm);
+
+	return has_attr;
+}
+
+/*
+ * Check the requirements of the tests via TEST_REQUIRE(), then run every
+ * entry of testlist in order and report each one as a passed kselftest.
+ */
+int main(int argc, char *argv[])
+{
+	int idx = 0;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS));
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION));
+	TEST_REQUIRE(machine_has_cmma());
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(testlist));
+
+	while (idx < ARRAY_SIZE(testlist)) {
+		const struct testdef *cur = &testlist[idx++];
+
+		cur->test();
+		ksft_test_result_pass("%s\n", cur->name);
+	}
+
+	ksft_finished();	/* Print results and exit() accordingly */
+}
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index 2fc3ad9c887e..d3c3aa93f090 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -163,6 +163,25 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu)
 	ent->eax = eax;
 }
 
+/*
+ * Exercise KVM_GET_CPUID2 with different buffer sizes. Given room for one
+ * entry more than vcpu->cpuid holds, the ioctl must set nent to that entry
+ * count. For every smaller nent it must fail with E2BIG and leave nent as it
+ * was passed in.
+ */
+static void test_get_cpuid2(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid2 *cpuid;
+	int i = 0;
+	int r;
+
+	cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1);
+
+	vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+	TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent,
+		    "KVM didn't update nent on success, wanted %u, got %u\n",
+		    vcpu->cpuid->nent, cpuid->nent);
+
+	/* Every nent below the real number of entries has to be rejected. */
+	while (i < vcpu->cpuid->nent) {
+		cpuid->nent = i;
+		r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+		TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r));
+		TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure");
+		i++;
+	}
+	free(cpuid);
+}
+
 int main(void)
 {
 	struct kvm_vcpu *vcpu;
@@ -183,5 +202,7 @@ int main(void)
 
 	set_cpuid_after_run(vcpu);
 
+	test_get_cpuid2(vcpu);
+
 	kvm_vm_free(vm);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
new file mode 100644
index 000000000000..beb7e2c10211
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty logging page splitting test
+ *
+ * Based on dirty_log_perf.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2023, Google, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "memstress.h"
+#include "guest_modes.h"
+
+/* Number of vCPUs and memslots of the test VM, and of dirty-logging passes. */
+#define VCPUS		2
+#define SLOTS		2
+#define ITERATIONS	2
+
+/* Size of the memory region of each vCPU; can be overridden with -b. */
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+/* Backing source of the guest memory; can be overridden with -s. */
+static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB;
+
+/*
+ * Argument for KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. While it is 0, run_test()
+ * neither enables the capability nor clears the dirty log manually.
+ */
+static u64 dirty_log_manual_caps;
+/* Set to true by run_test() to make the vCPU workers leave their loop. */
+static bool host_quit;
+/* Current pass; incremented by run_vcpu_iteration(), polled by the workers. */
+static int iteration;
+/* Per vCPU: the value of 'iteration' the worker last finished a run for. */
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+
+/* Snapshot of the VM's page count stats, filled in by get_page_stats(). */
+struct kvm_page_stats {
+	uint64_t pages_4k;	/* value of the "pages_4k" stat */
+	uint64_t pages_2m;	/* value of the "pages_2m" stat */
+	uint64_t pages_1g;	/* value of the "pages_1g" stat */
+	uint64_t hugepages;	/* pages_2m + pages_1g */
+};
+
+/*
+ * Read the "pages_4k", "pages_2m" and "pages_1g" stats of the VM into *stats,
+ * derive the total number of huge pages (2M + 1G) and emit a debug line.
+ * stage names the step of the test that has just finished and only appears
+ * in that debug line.
+ */
+static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage)
+{
+	stats->pages_4k = vm_get_stat(vm, "pages_4k");
+	stats->pages_2m = vm_get_stat(vm, "pages_2m");
+	stats->pages_1g = vm_get_stat(vm, "pages_1g");
+	stats->hugepages = stats->pages_2m + stats->pages_1g;
+
+	/* The counters are unsigned (uint64_t), so print them as such. */
+	pr_debug("\nPage stats after %s: 4K: %lu 2M: %lu 1G: %lu huge: %lu\n",
+		 stage, stats->pages_4k, stats->pages_2m, stats->pages_1g,
+		 stats->hugepages);
+}
+
+/*
+ * Start the next iteration and busy-wait until all VCPUS workers have
+ * recorded it in vcpu_last_completed_iteration[]. The vm argument is not
+ * used.
+ */
+static void run_vcpu_iteration(struct kvm_vm *vm)
+{
+	int i;
+
+	/* vcpu_worker() spins on this value; changing it releases the workers. */
+	iteration++;
+	for (i = 0; i < VCPUS; i++) {
+		while (READ_ONCE(vcpu_last_completed_iteration[i]) !=
+		       iteration)
+			;
+	}
+}
+
+/*
+ * Worker function handed to memstress_start_vcpu_threads(). Until host_quit
+ * is set, it runs the vCPU, expects the run to end with a UCALL_SYNC,
+ * publishes the iteration number it sampled before the run and then spins
+ * until 'iteration' changes.
+ */
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
+{
+	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
+	int vcpu_idx = vcpu_args->vcpu_idx;
+
+	while (!READ_ONCE(host_quit)) {
+		/* Sampled before the run; this is the value reported below. */
+		int current_iteration = READ_ONCE(iteration);
+
+		vcpu_run(vcpu);
+
+		ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
+
+		/* run_vcpu_iteration() polls this entry to detect completion. */
+		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
+
+		/*
+		 * Wait for the start of the next iteration to be signaled.
+		 * NOTE(review): the ">= 0" term skips the wait while
+		 * 'iteration' is negative - presumably to cover the initial
+		 * value of -1 set by run_test(); confirm.
+		 */
+		while (current_iteration == READ_ONCE(iteration) &&
+		       READ_ONCE(iteration) >= 0 &&
+		       !READ_ONCE(host_quit))
+			;
+	}
+}
+
+/*
+ * Run one dirty-logging cycle on a memstress VM in the given guest mode and
+ * check how the VM's page count stats evolve.
+ *
+ * The steps are: populate memory, enable dirty logging, run ITERATIONS passes
+ * that dirty memory and fetch (and, with dirty_log_manual_caps set, clear)
+ * the dirty log, disable dirty logging, and let the vCPUs touch their memory
+ * again. Page stats are sampled after every step and compared once the VM
+ * has been destroyed. The second argument is not used.
+ */
+static void run_test(enum vm_guest_mode mode, void *unused)
+{
+	struct kvm_vm *vm;
+	unsigned long **bitmaps;
+	uint64_t guest_num_pages;
+	uint64_t host_num_pages;
+	uint64_t pages_per_slot;
+	int i;
+	uint64_t total_4k_pages;
+	struct kvm_page_stats stats_populated;
+	struct kvm_page_stats stats_dirty_logging_enabled;
+	struct kvm_page_stats stats_dirty_pass[ITERATIONS];
+	struct kvm_page_stats stats_clear_pass[ITERATIONS];
+	struct kvm_page_stats stats_dirty_logging_disabled;
+	struct kvm_page_stats stats_repopulated;
+
+	vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size,
+				 SLOTS, backing_src, false);
+
+	/* Derive the number of pages each slot's dirty bitmap has to cover. */
+	guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift;
+	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+	pages_per_slot = host_num_pages / SLOTS;
+
+	bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot);
+
+	if (dirty_log_manual_caps)
+		vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+			      dirty_log_manual_caps);
+
+	/*
+	 * Start the iterations. Beginning at -1 makes the first
+	 * run_vcpu_iteration() call below iteration 0 (populating memory), so
+	 * that the dirty-logging passes are numbered 1..ITERATIONS.
+	 */
+	iteration = -1;
+	host_quit = false;
+
+	for (i = 0; i < VCPUS; i++)
+		vcpu_last_completed_iteration[i] = -1;
+
+	memstress_start_vcpu_threads(VCPUS, vcpu_worker);
+
+	run_vcpu_iteration(vm);
+	get_page_stats(vm, &stats_populated, "populating memory");
+
+	/* Enable dirty logging */
+	memstress_enable_dirty_logging(vm, SLOTS);
+
+	get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging");
+
+	while (iteration < ITERATIONS) {
+		run_vcpu_iteration(vm);
+		/* 'iteration' is 1-based here, the stats arrays are 0-based. */
+		get_page_stats(vm, &stats_dirty_pass[iteration - 1],
+			       "dirtying memory");
+
+		memstress_get_dirty_log(vm, bitmaps, SLOTS);
+
+		if (dirty_log_manual_caps) {
+			memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot);
+
+			get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log");
+		}
+	}
+
+	/* Disable dirty logging */
+	memstress_disable_dirty_logging(vm, SLOTS);
+
+	get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging");
+
+	/* Run vCPUs again to fault pages back in. */
+	run_vcpu_iteration(vm);
+	get_page_stats(vm, &stats_repopulated, "repopulating memory");
+
+	/*
+	 * Tell the vCPU threads to quit.  No need to manually check that vCPUs
+	 * have stopped running after disabling dirty logging, the join will
+	 * wait for them to exit.
+	 */
+	host_quit = true;
+	memstress_join_vcpu_threads(VCPUS);
+
+	memstress_free_bitmaps(bitmaps, SLOTS);
+	memstress_destroy_vm(vm);
+
+	/*
+	 * Make assertions about the page counts. First express the populated
+	 * memory in units of 4K pages: a 2M page counts as 512 of them, a 1G
+	 * page as 512 * 512.
+	 */
+	total_4k_pages = stats_populated.pages_4k;
+	total_4k_pages += stats_populated.pages_2m * 512;
+	total_4k_pages += stats_populated.pages_1g * 512 * 512;
+
+	/*
+	 * Check that all huge pages were split. Since large pages can only
+	 * exist in the data slot, and the vCPUs should have dirtied all pages
+	 * in the data slot, there should be no huge pages left after splitting.
+	 * Splitting happens at dirty log enable time without
+	 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
+	 * with that capability.
+	 */
+	if (dirty_log_manual_caps) {
+		ASSERT_EQ(stats_clear_pass[0].hugepages, 0);
+		ASSERT_EQ(stats_clear_pass[0].pages_4k, total_4k_pages);
+		ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages);
+	} else {
+		ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0);
+		ASSERT_EQ(stats_dirty_logging_enabled.pages_4k, total_4k_pages);
+	}
+
+	/*
+	 * Once dirty logging is disabled and the vCPUs have touched all their
+	 * memory again, the page counts should be the same as they were
+	 * right after initial population of memory.
+	 */
+	ASSERT_EQ(stats_populated.pages_4k, stats_repopulated.pages_4k);
+	ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m);
+	ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g);
+}
+
+/*
+ * Print the usage text to stdout. name is inserted into the usage line and
+ * the description of -s is produced by backing_src_help().
+ */
+static void help(char *name)
+{
+	static const char size_opt_help[] =
+		" -b: specify the size of the memory region which should be\n"
+		"     dirtied by each vCPU. e.g. 10M or 3G.\n"
+		"     (default: 1G)\n";
+
+	puts("");
+	printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n", name);
+	puts("");
+	printf("%s", size_opt_help);
+	backing_src_help("-s");
+	puts("");
+}
+
+/*
+ * Parse the command line (-b per-vCPU memory size, -s backing source, -h
+ * help), then run the test for every default guest mode: first without
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and, if kvm_check_cap() reports the
+ * capability, once more with it enabled.
+ *
+ * Return: 0 once all runs have finished. An unknown option prints the help
+ * text and exits with status 1.
+ */
+int main(int argc, char *argv[])
+{
+	int opt;
+
+	TEST_REQUIRE(get_kvm_param_bool("eager_page_split"));
+	TEST_REQUIRE(get_kvm_param_bool("tdp_mmu"));
+
+	while ((opt = getopt(argc, argv, "b:hs:")) != -1) {
+		switch (opt) {
+		case 'b':
+			guest_percpu_mem_size = parse_size(optarg);
+			break;
+		case 'h':
+			help(argv[0]);
+			exit(0);
+		case 's':
+			backing_src = parse_backing_src_type(optarg);
+			break;
+		default:
+			help(argv[0]);
+			exit(1);
+		}
+	}
+
+	if (!is_backing_src_hugetlb(backing_src)) {
+		pr_info("This test will only work reliably with HugeTLB memory. "
+			"It can work with THP, but that is best effort.\n");
+	}
+
+	guest_modes_append_default();
+
+	/* First round: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is left disabled. */
+	dirty_log_manual_caps = 0;
+	for_each_guest_mode(run_test, NULL);
+
+	dirty_log_manual_caps =
+		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+
+	if (dirty_log_manual_caps) {
+		/* Second round: only request the two flags this test relies on. */
+		dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+					  KVM_DIRTY_LOG_INITIALLY_SET);
+		for_each_guest_mode(run_test, NULL);
+	} else {
+		/* Terminate the line like the other messages of this test do. */
+		pr_info("Skipping testing with MANUAL_PROTECT as it is not supported\n");
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
index 251794f83719..7f36c32fa760 100644
--- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
+++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
@@ -226,7 +226,7 @@ static void help(char *name)
 	puts("");
 	printf("usage: %s [-h] [-p period_ms] [-t token]\n", name);
 	puts("");
-	printf(" -p: The NX reclaim period in miliseconds.\n");
+	printf(" -p: The NX reclaim period in milliseconds.\n");
 	printf(" -t: The magic token to indicate environment setup is done.\n");
 	printf(" -r: The test has reboot permissions and can disable NX huge pages.\n");
 	puts("");
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
index fa03c8d1ce4e..e710b6e7fb38 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -116,29 +116,21 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
 	GUEST_DONE();
 }
 
-static void stable_tsc_check_supported(void)
+static bool system_has_stable_tsc(void)
 {
+	bool tsc_is_stable;
 	FILE *fp;
 	char buf[4];
 
 	fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
 	if (fp == NULL)
-		goto skip_test;
+		return false;
 
-	if (fgets(buf, sizeof(buf), fp) == NULL)
-		goto close_fp;
+	tsc_is_stable = fgets(buf, sizeof(buf), fp) &&
+			!strncmp(buf, "tsc", sizeof(buf));
 
-	if (strncmp(buf, "tsc", sizeof(buf)))
-		goto close_fp;
-
-	fclose(fp);
-	return;
-
-close_fp:
 	fclose(fp);
-skip_test:
-	print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable");
-	exit(KSFT_SKIP);
+	return tsc_is_stable;
 }
 
 int main(int argc, char *argv[])
@@ -156,7 +148,7 @@ int main(int argc, char *argv[])
 
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL));
-	stable_tsc_check_supported();
+	TEST_REQUIRE(system_has_stable_tsc());
 
 	/*
 	 * We set L1's scale factor to be a random number from 2 to 10.
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-07-03 15:32:22 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-07-03 15:32:22 -0700
commit	e8069f5a8e3bdb5fdeeff895780529388592ee7a (patch)
tree	ce35ab85db9b66a7e488707fccdb33ce54f696dd /tools
parent	eded37770c9f80ecd5ba842359c4f1058d9812c3 (diff)
parent	255006adb3da71bb75c334453786df781b415f54 (diff)