author     Linus Torvalds <torvalds@linux-foundation.org>  2024-08-02 10:17:49 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-08-02 10:17:49 -0700
commit     725d410facf9f232832bf6ea14a0c8814d890e06
tree       47ee0f1783f44b1134dd95f41d72359828a384a2
parent     948752d2e010e11b56a877975e7e9158d6d31823
parent     1773014a975919195be71646fc2c2cad1570fce4
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
 "The bulk of the changes here is a largish change to guest_memfd,
  delaying the clearing and encryption of guest-private pages until they
  are actually added to guest page tables. This started as "let's make
  it impossible to misuse the API" for SEV-SNP; but then it ballooned a
  bit.

  The new logic is generally simpler and more ready for hugepage support
  in guest_memfd.

  Summary:

   - fix latent bug in how usage of large pages is determined for
     confidential VMs

   - fix "underline too short" in docs

   - eliminate log spam from limited APIC timer periods

   - disallow pre-faulting of memory before SEV-SNP VMs are initialized

   - delay clearing and encrypting private memory until it is added to
     guest page tables

   - this change also enables another small cleanup: the checks in
     SNP_LAUNCH_UPDATE that limit it to non-populated, private pages can
     now be moved in the common kvm_gmem_populate() function

   - fix compilation error that the RISC-V merge introduced in selftests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86/mmu: fix determination of max NPT mapping level for private pages
  KVM: riscv: selftests: Fix compile error
  KVM: guest_memfd: abstract how prepared folios are recorded
  KVM: guest_memfd: let kvm_gmem_populate() operate only on private gfns
  KVM: extend kvm_range_has_memory_attributes() to check subset of attributes
  KVM: cleanup and add shortcuts to kvm_range_has_memory_attributes()
  KVM: guest_memfd: move check for already-populated page to common code
  KVM: remove kvm_arch_gmem_prepare_needed()
  KVM: guest_memfd: make kvm_gmem_prepare_folio() operate on a single struct kvm
  KVM: guest_memfd: delay kvm_gmem_prepare_folio() until the memory is passed to the guest
  KVM: guest_memfd: return locked folio from __kvm_gmem_get_pfn
  KVM: rename CONFIG_HAVE_KVM_GMEM_* to CONFIG_HAVE_KVM_ARCH_GMEM_*
  KVM: guest_memfd: do not go through struct page
  KVM: guest_memfd: delay folio_mark_uptodate() until after successful preparation
  KVM: guest_memfd: return folio from __kvm_gmem_get_pfn()
  KVM: x86: disallow pre-fault for SNP VMs before initialization
  KVM: Documentation: Fix title underline too short warning
  KVM: x86: Eliminate log spam from limited APIC timer periods
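As a hedged illustration of the extended kvm_range_has_memory_attributes() interface (the Summary's "check subset of attributes" item): only the bits selected by the new mask argument are compared against the requested attribute value, for every gfn in the range. The wrapper below is a hypothetical sketch, not part of the patch; the function's signature and KVM_MEMORY_ATTRIBUTE_PRIVATE are taken from the diff that follows.

    /* Hypothetical caller sketch (kernel context, linux/kvm_host.h assumed).
     * The range qualifies only if every gfn has the PRIVATE attribute bit
     * set; other attribute bits are ignored because they are not in the
     * mask. */
    static bool range_is_entirely_private(struct kvm *kvm, gfn_t start, gfn_t end)
    {
            return kvm_range_has_memory_attributes(kvm, start, end,
                                                   KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                                   KVM_MEMORY_ATTRIBUTE_PRIVATE);
    }

This mirrors the call added to kvm_gmem_populate() further down, where the mask and the expected value are both KVM_MEMORY_ATTRIBUTE_PRIVATE.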
-rw-r--r--  Documentation/virt/kvm/api.rst                    |   8
-rw-r--r--  arch/x86/include/asm/kvm_host.h                   |   1
-rw-r--r--  arch/x86/kvm/Kconfig                              |   4
-rw-r--r--  arch/x86/kvm/lapic.c                              |   2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c                            |   7
-rw-r--r--  arch/x86/kvm/svm/sev.c                            |  17
-rw-r--r--  arch/x86/kvm/svm/svm.c                            |   1
-rw-r--r--  arch/x86/kvm/x86.c                                |  12
-rw-r--r--  include/linux/kvm_host.h                          |   9
-rw-r--r--  tools/testing/selftests/kvm/riscv/get-reg-list.c  |   8
-rw-r--r--  virt/kvm/Kconfig                                  |   4
-rw-r--r--  virt/kvm/guest_memfd.c                            | 227
-rw-r--r--  virt/kvm/kvm_main.c                               |  49
13 files changed, 202 insertions, 147 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index fe722c5dada9..33938468d62d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6368,7 +6368,7 @@ a single guest_memfd file, but the bound ranges must not overlap).
See KVM_SET_USER_MEMORY_REGION2 for additional details.
4.143 KVM_PRE_FAULT_MEMORY
-------------------------
+---------------------------
:Capability: KVM_CAP_PRE_FAULT_MEMORY
:Architectures: none
@@ -6405,6 +6405,12 @@ for the current vCPU state. KVM maps memory as if the vCPU generated a
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
+In the case of confidential VM types where there is an initial set up of
+private guest memory before the guest is 'finalized'/measured, this ioctl
+should only be issued after completing all the necessary setup to put the
+guest into a 'finalized' state so that the above semantics can be reliably
+ensured.
+
In some cases, multiple vCPUs might share the page tables. In this
case, the ioctl can be called in parallel.
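
A hedged userspace-side sketch of what the new paragraph implies for SEV-SNP guests: issue KVM_PRE_FAULT_MEMORY only once the launch flow has been finalized (KVM_SEV_SNP_LAUNCH_FINISH). The helper name and the vcpu_fd/gpa/size parameters are placeholders; only the ioctl and struct kvm_pre_fault_memory come from the KVM uAPI.

    /* Hypothetical helper: pre-fault a guest-physical range on one vCPU.
     * For SNP VMs this must run only after the guest is finalized; with the
     * x86 change below, calling it earlier fails with errno EOPNOTSUPP. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int pre_fault_range(int vcpu_fd, __u64 gpa, __u64 size)
    {
            struct kvm_pre_fault_memory range;

            memset(&range, 0, sizeof(range));
            range.gpa = gpa;
            range.size = size;

            return ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
    }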
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..94e7b5a4fafe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1305,6 +1305,7 @@ struct kvm_arch {
u8 vm_type;
bool has_private_mem;
bool has_protected_state;
+ bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4287a8071a3a..472a1537b7a9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -141,8 +141,8 @@ config KVM_AMD_SEV
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
select KVM_GENERIC_PRIVATE_MEM
- select HAVE_KVM_GMEM_PREPARE
- select HAVE_KVM_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
help
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a7172ba59ad2..4915acdbfcd8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1743,7 +1743,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
s64 min_period = min_timer_period_us * 1000LL;
if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
+ pr_info_once(
"vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 901be9e420a4..928cf84778b0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4335,7 +4335,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
if (req_max_level)
max_level = min(max_level, req_max_level);
- return req_max_level;
+ return max_level;
}
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
@@ -4743,6 +4743,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
u64 end;
int r;
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
@@ -7510,7 +7513,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a16c873b3232..532df12b43c5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2279,18 +2279,11 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
bool assigned;
int level;
- if (!kvm_mem_is_private(kvm, gfn)) {
- pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
- __func__, gfn);
- ret = -EINVAL;
- goto err;
- }
-
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
- ret = -EINVAL;
+ ret = ret ? -EINVAL : -EEXIST;
goto err;
}
@@ -2549,6 +2542,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
kfree(id_auth);
e_free_id_block:
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c115d26844f7..d6f252555ab3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4949,6 +4949,7 @@ static int svm_vm_init(struct kvm *kvm)
to_kvm_sev_info(kvm)->need_init = true;
kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
+ kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
}
if (!pause_filter_count || !pause_filter_thresh)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af6c8cf6a37a..ef3d3511e4af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12646,6 +12646,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.vm_type = type;
kvm->arch.has_private_mem =
(type == KVM_X86_SW_PROTECTED_VM);
+ /* Decided by the vendor code for other VM types. */
+ kvm->arch.pre_fault_allowed =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
ret = kvm_page_track_init(kvm);
if (ret)
@@ -13641,19 +13644,14 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
-#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
-bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
-{
- return kvm->arch.vm_type == KVM_X86_SNP_VM;
-}
-
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
{
return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
}
#endif
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
kvm_x86_call(gmem_invalidate)(start, end);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..79a6b1a63027 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2414,7 +2414,7 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
}
bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long attrs);
+ unsigned long mask, unsigned long attrs);
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
@@ -2445,11 +2445,11 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
-#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
-bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
#endif
+#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
/**
* kvm_gmem_populate() - Populate/prepare a GPA range with guest data
*
@@ -2476,8 +2476,9 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque);
+#endif
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
#endif
diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index f92c2fb23fcd..8e34f7fa44e9 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -961,10 +961,10 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zbkb, ZBKB);
KVM_ISA_EXT_SIMPLE_CONFIG(zbkc, ZBKC);
KVM_ISA_EXT_SIMPLE_CONFIG(zbkx, ZBKX);
KVM_ISA_EXT_SIMPLE_CONFIG(zbs, ZBS);
-KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF),
+KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF);
KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP);
KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA);
KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH);
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index b14e14cdbfb9..fd6a3010afa8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -113,10 +113,10 @@ config KVM_GENERIC_PRIVATE_MEM
select KVM_PRIVATE_MEM
bool
-config HAVE_KVM_GMEM_PREPARE
+config HAVE_KVM_ARCH_GMEM_PREPARE
bool
depends on KVM_PRIVATE_MEM
-config HAVE_KVM_GMEM_INVALIDATE
+config HAVE_KVM_ARCH_GMEM_INVALIDATE
bool
depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 1c509c351261..8f079a61a56d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,84 +13,93 @@ struct kvm_gmem {
struct list_head entry;
};
-static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
+/**
+ * folio_file_pfn - like folio_file_page, but return a pfn.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Return: The pfn for this index.
+ */
+static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
-#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
- struct list_head *gmem_list = &inode->i_mapping->i_private_list;
- struct kvm_gmem *gmem;
+ return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
+}
- list_for_each_entry(gmem, gmem_list, entry) {
- struct kvm_memory_slot *slot;
- struct kvm *kvm = gmem->kvm;
- struct page *page;
- kvm_pfn_t pfn;
- gfn_t gfn;
- int rc;
-
- if (!kvm_arch_gmem_prepare_needed(kvm))
- continue;
-
- slot = xa_load(&gmem->bindings, index);
- if (!slot)
- continue;
-
- page = folio_file_page(folio, index);
- pfn = page_to_pfn(page);
- gfn = slot->base_gfn + index - slot->gmem.pgoff;
- rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
- if (rc) {
- pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
- index, gfn, pfn, rc);
- return rc;
- }
+static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+ pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
+ kvm_pfn_t pfn = folio_file_pfn(folio, index);
+ gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
+ int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
+ if (rc) {
+ pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
+ index, gfn, pfn, rc);
+ return rc;
}
-
#endif
+
return 0;
}
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
+static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
- struct folio *folio;
+ folio_mark_uptodate(folio);
+}
- /* TODO: Support huge pages. */
- folio = filemap_grab_folio(inode->i_mapping, index);
- if (IS_ERR(folio))
- return folio;
+/*
+ * Process @folio, which contains @gfn, so that the guest can use it.
+ * The folio must be locked and the gfn must be contained in @slot.
+ * On successful return the guest sees a zero page so as to avoid
+ * leaking host data and the up-to-date flag is set.
+ */
+static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, struct folio *folio)
+{
+ unsigned long nr_pages, i;
+ pgoff_t index;
+ int r;
+
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
/*
- * Use the up-to-date flag to track whether or not the memory has been
- * zeroed before being handed off to the guest. There is no backing
- * storage for the memory, so the folio will remain up-to-date until
- * it's removed.
+ * Preparing huge folios should always be safe, since it should
+ * be possible to split them later if needed.
*
- * TODO: Skip clearing pages when trusted firmware will do it when
- * assigning memory to the guest.
+ * Right now the folio order is always going to be zero, but the
+ * code is ready for huge folios. The only assumption is that
+ * the base pgoff of memslots is naturally aligned with the
+ * requested page order, ensuring that huge folios can also use
+ * huge page table entries for GPA->HPA mapping.
+ *
+ * The order will be passed when creating the guest_memfd, and
+ * checked when creating memslots.
*/
- if (!folio_test_uptodate(folio)) {
- unsigned long nr_pages = folio_nr_pages(folio);
- unsigned long i;
-
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
-
- folio_mark_uptodate(folio);
- }
+ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
+ index = gfn - slot->base_gfn + slot->gmem.pgoff;
+ index = ALIGN_DOWN(index, 1 << folio_order(folio));
+ r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
+ if (!r)
+ kvm_gmem_mark_prepared(folio);
- if (prepare) {
- int r = kvm_gmem_prepare_folio(inode, index, folio);
- if (r < 0) {
- folio_unlock(folio);
- folio_put(folio);
- return ERR_PTR(r);
- }
- }
+ return r;
+}
- /*
- * Ignore accessed, referenced, and dirty flags. The memory is
- * unevictable and there is no storage to write back to.
- */
- return folio;
+/*
+ * Returns a locked folio on success. The caller is responsible for
+ * setting the up-to-date flag before the memory is mapped into the guest.
+ * There is no backing storage for the memory, so the folio will remain
+ * up-to-date until it's removed.
+ *
+ * Ignore accessed, referenced, and dirty flags. The memory is
+ * unevictable and there is no storage to write back to.
+ */
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+ /* TODO: Support huge pages. */
+ return filemap_grab_folio(inode->i_mapping, index);
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -190,7 +199,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
break;
}
- folio = kvm_gmem_get_folio(inode, index, true);
+ folio = kvm_gmem_get_folio(inode, index);
if (IS_ERR(folio)) {
r = PTR_ERR(folio);
break;
@@ -343,7 +352,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
struct page *page = folio_page(folio, 0);
@@ -358,7 +367,7 @@ static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
.free_folio = kvm_gmem_free_folio,
#endif
};
@@ -541,64 +550,76 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
fput(file);
}
-static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
+/* Returns a locked folio on success. */
+static struct folio *
+__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared,
+ int *max_order)
{
pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
- struct page *page;
- int r;
if (file != slot->gmem.file) {
WARN_ON_ONCE(slot->gmem.file);
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
}
gmem = file->private_data;
if (xa_load(&gmem->bindings, index) != slot) {
WARN_ON_ONCE(xa_load(&gmem->bindings, index));
- return -EIO;
+ return ERR_PTR(-EIO);
}
- folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
+ folio = kvm_gmem_get_folio(file_inode(file), index);
if (IS_ERR(folio))
- return PTR_ERR(folio);
+ return folio;
if (folio_test_hwpoison(folio)) {
folio_unlock(folio);
folio_put(folio);
- return -EHWPOISON;
+ return ERR_PTR(-EHWPOISON);
}
- page = folio_file_page(folio, index);
-
- *pfn = page_to_pfn(page);
+ *pfn = folio_file_pfn(folio, index);
if (max_order)
*max_order = 0;
- r = 0;
-
- folio_unlock(folio);
-
- return r;
+ *is_prepared = folio_test_uptodate(folio);
+ return folio;
}
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
struct file *file = kvm_gmem_get_file(slot);
- int r;
+ struct folio *folio;
+ bool is_prepared = false;
+ int r = 0;
if (!file)
return -EFAULT;
- r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
+ folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
+
+ if (!is_prepared)
+ r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+
+ folio_unlock(folio);
+ if (r < 0)
+ folio_put(folio);
+
+out:
fput(file);
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
@@ -625,7 +646,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
for (i = 0; i < npages; i += (1 << max_order)) {
+ struct folio *folio;
gfn_t gfn = start_gfn + i;
+ bool is_prepared = false;
kvm_pfn_t pfn;
if (signal_pending(current)) {
@@ -633,18 +656,39 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
break;
}
- ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
- if (ret)
+ folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
break;
+ }
- if (!IS_ALIGNED(gfn, (1 << max_order)) ||
- (npages - i) < (1 << max_order))
- max_order = 0;
+ if (is_prepared) {
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = -EEXIST;
+ break;
+ }
+
+ folio_unlock(folio);
+ WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
+ (npages - i) < (1 << max_order));
+
+ ret = -EINVAL;
+ while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ if (!max_order)
+ goto put_folio_and_exit;
+ max_order--;
+ }
p = src ? src + i * PAGE_SIZE : NULL;
ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+ if (!ret)
+ kvm_gmem_mark_prepared(folio);
- put_page(pfn_to_page(pfn));
+put_folio_and_exit:
+ folio_put(folio);
if (ret)
break;
}
@@ -655,3 +699,4 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..92901656a0d4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2398,48 +2398,47 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+ if (!kvm || kvm_arch_has_private_mem(kvm))
+ return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ return 0;
+}
+
/*
* Returns true if _all_ gfns in the range [@start, @end) have attributes
- * matching @attrs.
+ * such that the bits in @mask match @attrs.
*/
bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long attrs)
+ unsigned long mask, unsigned long attrs)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
unsigned long index;
- bool has_attrs;
void *entry;
- rcu_read_lock();
+ mask &= kvm_supported_mem_attributes(kvm);
+ if (attrs & ~mask)
+ return false;
- if (!attrs) {
- has_attrs = !xas_find(&xas, end - 1);
- goto out;
- }
+ if (end == start + 1)
+ return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
+
+ guard(rcu)();
+ if (!attrs)
+ return !xas_find(&xas, end - 1);
- has_attrs = true;
for (index = start; index < end; index++) {
do {
entry = xas_next(&xas);
} while (xas_retry(&xas, entry));
- if (xas.xa_index != index || xa_to_value(entry) != attrs) {
- has_attrs = false;
- break;
- }
+ if (xas.xa_index != index ||
+ (xa_to_value(entry) & mask) != attrs)
+ return false;
}
-out:
- rcu_read_unlock();
- return has_attrs;
-}
-
-static u64 kvm_supported_mem_attributes(struct kvm *kvm)
-{
- if (!kvm || kvm_arch_has_private_mem(kvm))
- return KVM_MEMORY_ATTRIBUTE_PRIVATE;
-
- return 0;
+ return true;
}
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
@@ -2534,7 +2533,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_lock(&kvm->slots_lock);
/* Nothing to do if the entire range as the desired attributes. */
- if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
+ if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
goto out_unlock;
/*