From a987370f8e7a1677ae385042644326d9cd145a20 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 10 Mar 2015 19:06:59 +0000
Subject: arm64: KVM: Fix stage-2 PGD allocation to have per-page refcounting

We're using __get_free_pages with to allocate the guest's stage-2
PGD. The standard behaviour of this function is to return a set of
pages where only the head page has a valid refcount.

This behaviour gets us into trouble when we're trying to increment
the refcount on a non-head page:

page:ffff7c00cfb693c0 count:0 mapcount:0 mapping:          (null) index:0x0
flags: 0x4000000000000000()
page dumped because: VM_BUG_ON_PAGE((*({ __attribute__((unused)) typeof((&page->_count)->counter) __var = ( typeof((&page->_count)->counter)) 0; (volatile typeof((&page->_count)->counter) *)&((&page->_count)->counter); })) <= 0)
BUG: failure at include/linux/mm.h:548/get_page()!
Kernel panic - not syncing: BUG!
CPU: 1 PID: 1695 Comm: kvm-vcpu-0 Not tainted 4.0.0-rc1+ #3825
Hardware name: APM X-Gene Mustang board (DT)
Call trace:
[<ffff80000008a09c>] dump_backtrace+0x0/0x13c
[<ffff80000008a1e8>] show_stack+0x10/0x1c
[<ffff800000691da8>] dump_stack+0x74/0x94
[<ffff800000690d78>] panic+0x100/0x240
[<ffff8000000a0bc4>] stage2_get_pmd+0x17c/0x2bc
[<ffff8000000a1dc4>] kvm_handle_guest_abort+0x4b4/0x6b0
[<ffff8000000a420c>] handle_exit+0x58/0x180
[<ffff80000009e7a4>] kvm_arch_vcpu_ioctl_run+0x114/0x45c
[<ffff800000099df4>] kvm_vcpu_ioctl+0x2e0/0x754
[<ffff8000001c0a18>] do_vfs_ioctl+0x424/0x5c8
[<ffff8000001c0bfc>] SyS_ioctl+0x40/0x78
CPU0: stopping

A possible approach for this is to split the compound page using
split_page() at allocation time, and change the teardown path to
free one page at a time.  It turns out that alloc_pages_exact() and
free_pages_exact() does exactly that.

While we're at it, the PGD allocation code is reworked to reduce
duplication.

This has been tested on an X-Gene platform with a 4kB/48bit-VA host
kernel, and kvmtool hacked to place memory in the second page of
the hardware PGD (PUD for the host kernel). Also regression-tested
on a Cubietruck (Cortex-A7).

 [ Reworked to use alloc_pages_exact() and free_pages_exact() and to
   return pointers directly instead of by reference as arguments
    - Christoffer ]

Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_mmu.h | 46 ++++------------------------------------
 1 file changed, 4 insertions(+), 42 deletions(-)

(limited to 'arch/arm64')
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 6458b5373142..a099cd9cdef8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -171,43 +171,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 #define KVM_PREALLOC_LEVEL	(0)
 #endif
 
-/**
- * kvm_prealloc_hwpgd - allocate inital table for VTTBR
- * @kvm:	The KVM struct pointer for the VM.
- * @pgd:	The kernel pseudo pgd
- *
- * When the kernel uses more levels of page tables than the guest, we allocate
- * a fake PGD and pre-populate it to point to the next-level page table, which
- * will be the real initial page table pointed to by the VTTBR.
- *
- * When KVM_PREALLOC_LEVEL==2, we allocate a single page for the PMD and
- * the kernel will use folded pud.  When KVM_PREALLOC_LEVEL==1, we
- * allocate 2 consecutive PUD pages.
- */
-static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd)
-{
-	unsigned int i;
-	unsigned long hwpgd;
-
-	if (KVM_PREALLOC_LEVEL == 0)
-		return 0;
-
-	hwpgd = __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTRS_PER_S2_PGD_SHIFT);
-	if (!hwpgd)
-		return -ENOMEM;
-
-	for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-		if (KVM_PREALLOC_LEVEL == 1)
-			pgd_populate(NULL, pgd + i,
-				     (pud_t *)hwpgd + i * PTRS_PER_PUD);
-		else if (KVM_PREALLOC_LEVEL == 2)
-			pud_populate(NULL, pud_offset(pgd, 0) + i,
-				     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
-	}
-
-	return 0;
-}
-
 static inline void *kvm_get_hwpgd(struct kvm *kvm)
 {
 	pgd_t *pgd = kvm->arch.pgd;
@@ -224,12 +187,11 @@ static inline void *kvm_get_hwpgd(struct kvm *kvm)
 	return pmd_offset(pud, 0);
 }
 
-static inline void kvm_free_hwpgd(struct kvm *kvm)
+static inline unsigned int kvm_get_hwpgd_size(void)
 {
-	if (KVM_PREALLOC_LEVEL > 0) {
-		unsigned long hwpgd = (unsigned long)kvm_get_hwpgd(kvm);
-		free_pages(hwpgd, PTRS_PER_S2_PGD_SHIFT);
-	}
+	if (KVM_PREALLOC_LEVEL > 0)
+		return PTRS_PER_S2_PGD * PAGE_SIZE;
+	return PTRS_PER_S2_PGD * sizeof(pgd_t);
 }
 
 static inline bool kvm_page_empty(void *ptr)
-- 
cgit v1.2.3-58-ga151


From 04b8dc85bf4a64517e3cf20e409eeaa503b15cc1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 10 Mar 2015 19:07:00 +0000
Subject: arm64: KVM: Do not use pgd_index to index stage-2 pgd

The kernel's pgd_index macro is designed to index a normal, page
sized array. KVM is a bit diffferent, as we can use concatenated
pages to have a bigger address space (for example 40bit IPA with
4kB pages gives us an 8kB PGD.

In the above case, the use of pgd_index will always return an index
inside the first 4kB, which makes a guest that has memory above
0x8000000000 rather unhappy, as it spins forever in a page fault,
whist the host happilly corrupts the lower pgd.

The obvious fix is to get our own kvm_pgd_index that does the right
thing(tm).

Tested on X-Gene with a hacked kvmtool that put memory at a stupidly
high address.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   | 3 ++-
 arch/arm/kvm/mmu.c               | 8 ++++----
 arch/arm64/include/asm/kvm_mmu.h | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/arm64')

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index c57c41dc7e87..4cf48c3aca13 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -149,13 +149,14 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
 })
 
+#define kvm_pgd_index(addr)			pgd_index(addr)
+
 static inline bool kvm_page_empty(void *ptr)
 {
 	struct page *ptr_page = virt_to_page(ptr);
 	return page_count(ptr_page) == 1;
 }
 
-
 #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
 #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
 #define kvm_pud_table_empty(kvm, pudp) (0)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index a48a73c6b866..5656d79c5a44 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -290,7 +290,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 	phys_addr_t addr = start, end = start + size;
 	phys_addr_t next;
 
-	pgd = pgdp + pgd_index(addr);
+	pgd = pgdp + kvm_pgd_index(addr);
 	do {
 		next = kvm_pgd_addr_end(addr, end);
 		if (!pgd_none(*pgd))
@@ -355,7 +355,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
 	phys_addr_t next;
 	pgd_t *pgd;
 
-	pgd = kvm->arch.pgd + pgd_index(addr);
+	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
 	do {
 		next = kvm_pgd_addr_end(addr, end);
 		stage2_flush_puds(kvm, pgd, addr, next);
@@ -830,7 +830,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pgd_t *pgd;
 	pud_t *pud;
 
-	pgd = kvm->arch.pgd + pgd_index(addr);
+	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
 	if (WARN_ON(pgd_none(*pgd))) {
 		if (!cache)
 			return NULL;
@@ -1120,7 +1120,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	pgd_t *pgd;
 	phys_addr_t next;
 
-	pgd = kvm->arch.pgd + pgd_index(addr);
+	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
 	do {
 		/*
 		 * Release kvm_mmu_lock periodically if the memory region is
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index a099cd9cdef8..bbfb600fa822 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -158,6 +158,8 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 #define PTRS_PER_S2_PGD		(1 << PTRS_PER_S2_PGD_SHIFT)
 #define S2_PGD_ORDER		get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
 
+#define kvm_pgd_index(addr)	(((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+
 /*
  * If we are concatenating first level stage-2 page tables, we would have less
  * than or equal to 16 pointers in the fake PGD, because that's what the
-- 
cgit v1.2.3-58-ga151


From 84ed7412b5eee1011579b3db7454b9cb6d26fa65 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 10 Mar 2015 19:07:01 +0000
Subject: arm64: KVM: Fix outdated comment about VTCR_EL2.PS

Commit 87366d8cf7b3 ("arm64: Add boot time configuration of
Intermediate Physical Address size") removed the hardcoded setting
of VTCR_EL2.PS to use ID_AA64MMFR0_EL1.PARange instead, but didn't
remove the (now rather misleading) comment.

Fix the comments to match reality (at least for the next few minutes).

Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_arm.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/arm64')

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 94674eb7e7bb..54bb4ba97441 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -129,6 +129,9 @@
  * 40 bits wide (T0SZ = 24).  Systems with a PARange smaller than 40 bits are
  * not known to exist and will break with this configuration.
  *
+ * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
+ * (see hyp-init.S).
+ *
  * Note that when using 4K pages, we concatenate two first level page tables
  * together.
  *
@@ -138,7 +141,6 @@
 #ifdef CONFIG_ARM64_64K_PAGES
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
  * 40bits input  (T0SZ = 24)
  * 64kB pages (TG0 = 1)
  * 2 level page tables (SL = 1)
@@ -150,7 +152,6 @@
 #else
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
  * 40bits input  (T0SZ = 24)
  * 4kB pages (TG0 = 0)
  * 3 level page tables (SL = 1)
-- 
cgit v1.2.3-58-ga151


From 285994a62c80f1d72c6924282bcb59608098d5ec Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 11 Mar 2015 12:20:39 +0000
Subject: arm64: Invalidate the TLB corresponding to intermediate page table
 levels

The ARM architecture allows the caching of intermediate page table
levels and page table freeing requires a sequence like:

	pmd_clear()
	TLB invalidation
	pte page freeing

With commit 5e5f6dc10546 (arm64: mm: enable HAVE_RCU_TABLE_FREE logic),
the page table freeing batching was moved from tlb_remove_page() to
tlb_remove_table(). The former takes care of TLB invalidation as this is
also shared with pte clearing and page cache page freeing. The latter,
however, does not invalidate the TLBs for intermediate page table levels
as it probably relies on the architecture code to do it if required.
When the mm->mm_users < 2, tlb_remove_table() does not do any batching
and page table pages are freed before tlb_finish_mmu() which performs
the actual TLB invalidation.

This patch introduces __tlb_flush_pgtable() for arm64 and calls it from
the {pte,pmd,pud}_free_tlb() directly without relying on deferred page
table freeing.

Fixes: 5e5f6dc10546 arm64: mm: enable HAVE_RCU_TABLE_FREE logic
Reported-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
Tested-by: Steve Capper <steve.capper@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlb.h      |  3 +++
 arch/arm64/include/asm/tlbflush.h | 13 +++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'arch/arm64')

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c028fe37456f..53d9c354219f 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -48,6 +48,7 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 				  unsigned long addr)
 {
+	__flush_tlb_pgtable(tlb->mm, addr);
 	pgtable_page_dtor(pte);
 	tlb_remove_entry(tlb, pte);
 }
@@ -56,6 +57,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 				  unsigned long addr)
 {
+	__flush_tlb_pgtable(tlb->mm, addr);
 	tlb_remove_entry(tlb, virt_to_page(pmdp));
 }
 #endif
@@ -64,6 +66,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 				  unsigned long addr)
 {
+	__flush_tlb_pgtable(tlb->mm, addr);
 	tlb_remove_entry(tlb, virt_to_page(pudp));
 }
 #endif
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 4abe9b945f77..c3bb05b98616 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -143,6 +143,19 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
 		flush_tlb_all();
 }
 
+/*
+ * Used to invalidate the TLB (walk caches) corresponding to intermediate page
+ * table levels (pgd/pud/pmd).
+ */
+static inline void __flush_tlb_pgtable(struct mm_struct *mm,
+				       unsigned long uaddr)
+{
+	unsigned long addr = uaddr >> 12 | ((unsigned long)ASID(mm) << 48);
+
+	dsb(ishst);
+	asm("tlbi	vae1is, %0" : : "r" (addr));
+	dsb(ish);
+}
 /*
  * On AArch64, the cache coherency is handled via the set_pte_at() function.
  */
-- 
cgit v1.2.3-58-ga151


From 60c0d45a7f7ab4e30452fa14deb23a33e29adbc2 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 6 Mar 2015 15:49:24 +0100
Subject: efi/arm64: use UEFI for system reset and poweroff

If UEFI Runtime Services are available, they are preferred over direct
PSCI calls or other methods to reset the system.

For the reset case, we need to hook into machine_restart(), as the
arm_pm_restart function pointer may be overwritten by modules.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/efi.c     | 9 +++++++++
 arch/arm64/kernel/process.c | 8 ++++++++
 2 files changed, 17 insertions(+)

(limited to 'arch/arm64')

diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index b42c7b480e1e..2b8d70164428 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -354,3 +354,12 @@ void efi_virtmap_unload(void)
 	efi_set_pgd(current->active_mm);
 	preempt_enable();
 }
+
+/*
+ * UpdateCapsule() depends on the system being shutdown via
+ * ResetSystem().
+ */
+bool efi_poweroff_required(void)
+{
+	return efi_enabled(EFI_RUNTIME_SERVICES);
+}
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index fde9923af859..c6b1f3b96f45 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -21,6 +21,7 @@
 #include <stdarg.h>
 
 #include <linux/compat.h>
+#include <linux/efi.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -150,6 +151,13 @@ void machine_restart(char *cmd)
 	local_irq_disable();
 	smp_send_stop();
 
+	/*
+	 * UpdateCapsule() depends on the system being reset via
+	 * ResetSystem().
+	 */
+	if (efi_enabled(EFI_RUNTIME_SERVICES))
+		efi_reboot(reboot_mode, NULL);
+
 	/* Now call the architecture specific reboot code. */
 	if (arm_pm_restart)
 		arm_pm_restart(reboot_mode, cmd);
-- 
cgit v1.2.3-58-ga151


From 947bb7587fc2c1d1f6b89462ef1255ec30d4e682 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 13 Mar 2015 16:21:18 +0100
Subject: arm64: put __boot_cpu_mode label after alignment instead of before

Another one for the big head.S spring cleaning: the label should
be after the .align or it may point to the padding.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/head.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm64')

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 8ce88e08c030..07f930540f4a 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -585,8 +585,8 @@ ENDPROC(set_cpu_boot_mode_flag)
  * zeroing of .bss would clobber it.
  */
 	.pushsection	.data..cacheline_aligned
-ENTRY(__boot_cpu_mode)
 	.align	L1_CACHE_SHIFT
+ENTRY(__boot_cpu_mode)
 	.long	BOOT_CPU_MODE_EL2
 	.long	0
 	.popsection
-- 
cgit v1.2.3-58-ga151