From a3118beb6a8cbe77ae3342125d920205871b0717 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 28 Jun 2012 22:12:36 -0400
Subject: [PATCH 01/15] xen/p2m: Fix the comment describing the P2M tree.

It mixed up the p2m_mid_missing with p2m_missing. Also
remove some extra spaces.

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 64effdc6da94..e4adbfbdfada 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -22,7 +22,7 @@
  *
  * P2M_PER_PAGE depends on the architecture, as a mfn is always
  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively. 
+ * 512 and 1024 entries respectively.
  *
  * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
  *
@@ -139,11 +139,11 @@
  *      /    | ~0, ~0, ....  |
  *     |     \---------------/
  *     |
- *     p2m_missing             p2m_missing
- * /------------------\     /------------\
- * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
- * | [p2m_mid_missing]+---->| ..., ~0    |
- * \------------------/     \------------/
+ *   p2m_mid_missing           p2m_missing
+ * /-----------------\     /------------\
+ * | [p2m_missing]   +---->| ~0, ~0, ~0 |
+ * | [p2m_missing]   +---->| ..., ~0    |
+ * \-----------------/     \------------/
  *
  * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
  */
@@ -423,7 +423,7 @@ static void free_p2m_page(void *p)
 	free_page((unsigned long)p);
 }
 
-/* 
+/*
  * Fully allocate the p2m structure for a given pfn.  We need to check
  * that both the top and mid levels are allocated, and make sure the
  * parallel mfn tree is kept in sync.  We may race with other cpus, so

From 59b294403e9814e7c1154043567f0d71bac7a511 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 19 Jul 2012 10:23:47 -0400
Subject: [PATCH 02/15] xen/x86: Use memblock_reserve for sensitive areas.

instead of a big memblock_reserve. This way we can be more
selective in freeing regions (and it also makes it easier
to understand where is what).

[v1: Move the auto_translate_physmap to proper line]
[v2: Per Stefano suggestion add more comments]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 48 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/p2m.c       |  5 +++++
 arch/x86/xen/setup.c     |  9 --------
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ff962d4b821e..e532eb50e8d7 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -998,7 +998,54 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 
 	return ret;
 }
+/*
+ * If the MFN is not in the m2p (provided to us by the hypervisor) this
+ * function won't do anything. In practice this means that the XenBus
+ * MFN won't be available for the initial domain. */
+static void __init xen_reserve_mfn(unsigned long mfn)
+{
+	unsigned long pfn;
 
+	if (!mfn)
+		return;
+	pfn = mfn_to_pfn(mfn);
+	if (phys_to_machine_mapping_valid(pfn))
+		memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE);
+}
+static void __init xen_reserve_internals(void)
+{
+	unsigned long size;
+
+	if (!xen_pv_domain())
+		return;
+
+	/* xen_start_info does not exist in the M2P, hence can't use
+	 * xen_reserve_mfn. */
+	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+
+	xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info));
+	xen_reserve_mfn(xen_start_info->store_mfn);
+
+	if (!xen_initial_domain())
+		xen_reserve_mfn(xen_start_info->console.domU.mfn);
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	/*
+	 * ALIGN up to compensate for the p2m_page pointing to an array that
+	 * can partially filled (look in xen_build_dynamic_phys_to_machine).
+	 */
+
+	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+	/* We could use xen_reserve_mfn here, but would end up looping quite
+	 * a lot (and call memblock_reserve for each PAGE), so lets just use
+	 * the easy way and reserve it wholesale. */
+	memblock_reserve(__pa(xen_start_info->mfn_list), size);
+
+	/* The pagetables are reserved in mmu.c */
+}
 void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1362,6 +1409,7 @@ asmlinkage void __init xen_start_kernel(void)
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
 
+	xen_reserve_internals();
 	/* Allocate and initialize top and mid mfn levels for p2m structure */
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index e4adbfbdfada..6a2bfa43c8a1 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -388,6 +388,11 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	}
 
 	m2p_override_init();
+
+	/* NOTE: We cannot call memblock_reserve here for the mfn_list as there
+	 * isn't enough pieces to make it work (for one - we are still using the
+	 * Xen provided pagetable). Do it later in xen_reserve_internals.
+	 */
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a4790bf22c59..9efca750405d 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -424,15 +424,6 @@ char * __init xen_memory_setup(void)
 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 			E820_RESERVED);
 
-	/*
-	 * Reserve Xen bits:
-	 *  - mfn_list
-	 *  - xen_start_info
-	 * See comment above "struct start_info" in <xen/interface/xen.h>
-	 */
-	memblock_reserve(__pa(xen_start_info->mfn_list),
-			 xen_start_info->pt_base - xen_start_info->mfn_list);
-
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
 	return "Xen";

From 806c312e50f122c47913145cf884f53dd09d9199 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 21 Aug 2012 14:31:24 -0400
Subject: [PATCH 03/15] xen/x86: Workaround 64-bit hypervisor and 32-bit
 initial domain.

If a 64-bit hypervisor is booted with a 32-bit initial domain,
the hypervisor deals with the initial domain as "compat" and
does some extra adjustments (like pagetables are 4 bytes instead
of 8). It also adjusts the xen_start_info->pt_base incorrectly.

When booted with a 32-bit hypervisor (32-bit initial domain):
..
(XEN)  Start info:    cf831000->cf83147c
(XEN)  Page tables:   cf832000->cf8b5000
..
[    0.000000] PT: cf832000 (f832000)
[    0.000000] Reserving PT: f832000->f8b5000

And with a 64-bit hypervisor:
(XEN)  Start info:    00000000cf831000->00000000cf8314b4
(XEN)  Page tables:   00000000cf832000->00000000cf8b6000

[    0.000000] PT: cf834000 (f834000)
[    0.000000] Reserving PT: f834000->f8b8000

To deal with this, we keep keep track of the highest physical
address we have reserved via memblock_reserve. If that address
does not overlap with pt_base, we have a gap which we reserve.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e532eb50e8d7..511f92d79e4a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1002,19 +1002,24 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
  * If the MFN is not in the m2p (provided to us by the hypervisor) this
  * function won't do anything. In practice this means that the XenBus
  * MFN won't be available for the initial domain. */
-static void __init xen_reserve_mfn(unsigned long mfn)
+static unsigned long __init xen_reserve_mfn(unsigned long mfn)
 {
-	unsigned long pfn;
+	unsigned long pfn, end_pfn = 0;
 
 	if (!mfn)
-		return;
+		return end_pfn;
+
 	pfn = mfn_to_pfn(mfn);
-	if (phys_to_machine_mapping_valid(pfn))
-		memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE);
+	if (phys_to_machine_mapping_valid(pfn)) {
+		end_pfn = PFN_PHYS(pfn) + PAGE_SIZE;
+		memblock_reserve(PFN_PHYS(pfn), end_pfn);
+	}
+	return end_pfn;
 }
 static void __init xen_reserve_internals(void)
 {
 	unsigned long size;
+	unsigned long last_phys = 0;
 
 	if (!xen_pv_domain())
 		return;
@@ -1022,12 +1027,13 @@ static void __init xen_reserve_internals(void)
 	/* xen_start_info does not exist in the M2P, hence can't use
 	 * xen_reserve_mfn. */
 	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+	last_phys = __pa(xen_start_info) + PAGE_SIZE;
 
-	xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info));
-	xen_reserve_mfn(xen_start_info->store_mfn);
+	last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys);
+	last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys);
 
 	if (!xen_initial_domain())
-		xen_reserve_mfn(xen_start_info->console.domU.mfn);
+		last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys);
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return;
@@ -1043,8 +1049,14 @@ static void __init xen_reserve_internals(void)
 	 * a lot (and call memblock_reserve for each PAGE), so lets just use
 	 * the easy way and reserve it wholesale. */
 	memblock_reserve(__pa(xen_start_info->mfn_list), size);
-
+	last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys);
 	/* The pagetables are reserved in mmu.c */
+
+	/* Under 64-bit hypervisor with a 32-bit domain, the hypervisor
+	 * offsets the pt_base by two pages. Hence the reservation that is done
+	 * in mmu.c misses two pages. We correct it here if we detect this. */
+	if (last_phys < __pa(xen_start_info->pt_base))
+		memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys);
 }
 void xen_setup_shared_info(void)
 {

From 51faaf2b0d5c7f44d82964f0c70b1c4e44d4e633 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 22 Aug 2012 13:00:10 -0400
Subject: [PATCH 04/15] Revert "xen/x86: Workaround 64-bit hypervisor and
 32-bit initial domain." and "xen/x86: Use memblock_reserve for sensitive
 areas."

This reverts commit 806c312e50f122c47913145cf884f53dd09d9199 and
commit 59b294403e9814e7c1154043567f0d71bac7a511.

And also documents setup.c and why we want to do it that way, which
is that we tried to make the the memblock_reserve more selective so
that it would be clear what region is reserved. Sadly we ran
in the problem wherein on a 64-bit hypervisor with a 32-bit
initial domain, the pt_base has the cr3 value which is not
neccessarily where the pagetable starts! As Jan put it: "
Actually, the adjustment turns out to be correct: The page
tables for a 32-on-64 dom0 get allocated in the order "first L1",
"first L2", "first L3", so the offset to the page table base is
indeed 2. When reading xen/include/public/xen.h's comment
very strictly, this is not a violation (since there nothing is said
that the first thing in the page table space is pointed to by
pt_base; I admit that this seems to be implied though, namely
do I think that it is implied that the page table space is the
range [pt_base, pt_base + nt_pt_frames), whereas that
range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
which - without a priori knowledge - the kernel would have
difficulty to figure out)." - so lets just fall back to the
easy way and reserve the whole region.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 60 ----------------------------------------
 arch/x86/xen/p2m.c       |  5 ----
 arch/x86/xen/setup.c     | 27 ++++++++++++++++++
 3 files changed, 27 insertions(+), 65 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 511f92d79e4a..ff962d4b821e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -998,66 +998,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 
 	return ret;
 }
-/*
- * If the MFN is not in the m2p (provided to us by the hypervisor) this
- * function won't do anything. In practice this means that the XenBus
- * MFN won't be available for the initial domain. */
-static unsigned long __init xen_reserve_mfn(unsigned long mfn)
-{
-	unsigned long pfn, end_pfn = 0;
 
-	if (!mfn)
-		return end_pfn;
-
-	pfn = mfn_to_pfn(mfn);
-	if (phys_to_machine_mapping_valid(pfn)) {
-		end_pfn = PFN_PHYS(pfn) + PAGE_SIZE;
-		memblock_reserve(PFN_PHYS(pfn), end_pfn);
-	}
-	return end_pfn;
-}
-static void __init xen_reserve_internals(void)
-{
-	unsigned long size;
-	unsigned long last_phys = 0;
-
-	if (!xen_pv_domain())
-		return;
-
-	/* xen_start_info does not exist in the M2P, hence can't use
-	 * xen_reserve_mfn. */
-	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
-	last_phys = __pa(xen_start_info) + PAGE_SIZE;
-
-	last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys);
-	last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys);
-
-	if (!xen_initial_domain())
-		last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys);
-
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
-	/*
-	 * ALIGN up to compensate for the p2m_page pointing to an array that
-	 * can partially filled (look in xen_build_dynamic_phys_to_machine).
-	 */
-
-	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-
-	/* We could use xen_reserve_mfn here, but would end up looping quite
-	 * a lot (and call memblock_reserve for each PAGE), so lets just use
-	 * the easy way and reserve it wholesale. */
-	memblock_reserve(__pa(xen_start_info->mfn_list), size);
-	last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys);
-	/* The pagetables are reserved in mmu.c */
-
-	/* Under 64-bit hypervisor with a 32-bit domain, the hypervisor
-	 * offsets the pt_base by two pages. Hence the reservation that is done
-	 * in mmu.c misses two pages. We correct it here if we detect this. */
-	if (last_phys < __pa(xen_start_info->pt_base))
-		memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys);
-}
 void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1421,7 +1362,6 @@ asmlinkage void __init xen_start_kernel(void)
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
 
-	xen_reserve_internals();
 	/* Allocate and initialize top and mid mfn levels for p2m structure */
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 6a2bfa43c8a1..e4adbfbdfada 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -388,11 +388,6 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	}
 
 	m2p_override_init();
-
-	/* NOTE: We cannot call memblock_reserve here for the mfn_list as there
-	 * isn't enough pieces to make it work (for one - we are still using the
-	 * Xen provided pagetable). Do it later in xen_reserve_internals.
-	 */
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 9efca750405d..740517be4da5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -424,6 +424,33 @@ char * __init xen_memory_setup(void)
 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 			E820_RESERVED);
 
+	/*
+	 * Reserve Xen bits:
+	 *  - mfn_list
+	 *  - xen_start_info
+	 * See comment above "struct start_info" in <xen/interface/xen.h>
+	 * We tried to make the the memblock_reserve more selective so
+	 * that it would be clear what region is reserved. Sadly we ran
+	 * in the problem wherein on a 64-bit hypervisor with a 32-bit
+	 * initial domain, the pt_base has the cr3 value which is not
+	 * neccessarily where the pagetable starts! As Jan put it: "
+	 * Actually, the adjustment turns out to be correct: The page
+	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
+	 * "first L2", "first L3", so the offset to the page table base is
+	 * indeed 2. When reading xen/include/public/xen.h's comment
+	 * very strictly, this is not a violation (since there nothing is said
+	 * that the first thing in the page table space is pointed to by
+	 * pt_base; I admit that this seems to be implied though, namely
+	 * do I think that it is implied that the page table space is the
+	 * range [pt_base, pt_base + nt_pt_frames), whereas that
+	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
+	 * which - without a priori knowledge - the kernel would have
+	 * difficulty to figure out)." - so lets just fall back to the
+	 * easy way and reserve the whole region.
+	 */
+	memblock_reserve(__pa(xen_start_info->mfn_list),
+			 xen_start_info->pt_base - xen_start_info->mfn_list);
+
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
 	return "Xen";

From 3699aad047e16a5775b1d051425f422a9384270d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 28 Jun 2012 22:47:35 -0400
Subject: [PATCH 05/15] xen/mmu: The xen_setup_kernel_pagetable doesn't need to
 return anything.

We don't need to return the new PGD - as we do not use it.

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c |  5 +----
 arch/x86/xen/mmu.c       | 10 ++--------
 arch/x86/xen/xen-ops.h   |  2 +-
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ff962d4b821e..d87a038a0484 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1258,7 +1258,6 @@ asmlinkage void __init xen_start_kernel(void)
 {
 	struct physdev_set_iopl set_iopl;
 	int rc;
-	pgd_t *pgd;
 
 	if (!xen_start_info)
 		return;
@@ -1350,8 +1349,6 @@ asmlinkage void __init xen_start_kernel(void)
 	acpi_numa = -1;
 #endif
 
-	pgd = (pgd_t *)xen_start_info->pt_base;
-
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1360,7 +1357,7 @@ asmlinkage void __init xen_start_kernel(void)
 	early_boot_irqs_disabled = true;
 
 	xen_raw_console_write("mapping kernel into physical memory\n");
-	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
 
 	/* Allocate and initialize top and mid mfn levels for p2m structure */
 	xen_build_mfn_list_list();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785631ce..4ac21a4c6da4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1719,8 +1719,7 @@ static void convert_pfn_mfn(void *v)
  * of the physical mapping once some sort of allocator has been set
  * up.
  */
-pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
-					 unsigned long max_pfn)
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
 	pud_t *l3;
 	pmd_t *l2;
@@ -1781,8 +1780,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 
 	memblock_reserve(__pa(xen_start_info->pt_base),
 			 xen_start_info->nr_pt_frames * PAGE_SIZE);
-
-	return pgd;
 }
 #else	/* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
@@ -1825,8 +1822,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
 }
 
-pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
-					 unsigned long max_pfn)
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
 	pmd_t *kernel_pmd;
 
@@ -1858,8 +1854,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 
 	memblock_reserve(__pa(xen_start_info->pt_base),
 			 xen_start_info->nr_pt_frames * PAGE_SIZE);
-
-	return initial_page_table;
 }
 #endif	/* CONFIG_X86_64 */
 
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 202d4c150154..2230f57a6ebe 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -27,7 +27,7 @@ void xen_setup_mfn_list_list(void);
 void xen_setup_shared_info(void);
 void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
-pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_reserve_top(void);
 extern unsigned long xen_max_p2m_pfn;
 

From 4fac153a7a260e40e10008a0d7a272719684e4cd Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 12 Jul 2012 13:55:25 -0400
Subject: [PATCH 06/15] xen/mmu: Provide comments describing the _ka and _va
 aliasing issue

Which is that the level2_kernel_pgt (__ka virtual addresses)
and level2_ident_pgt (__va virtual address) contain the same
PMD entries. So if you modify a PTE in __ka, it will be reflected
in __va (and vice-versa).

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4ac21a4c6da4..6ba610098dd9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1734,19 +1734,36 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	init_level4_pgt[0] = __pgd(0);
 
 	/* Pre-constructed entries are in pfn, so convert to mfn */
+	/* L4[272] -> level3_ident_pgt
+	 * L4[511] -> level3_kernel_pgt */
 	convert_pfn_mfn(init_level4_pgt);
+
+	/* L3_i[0] -> level2_ident_pgt */
 	convert_pfn_mfn(level3_ident_pgt);
+	/* L3_k[510] -> level2_kernel_pgt
+	 * L3_i[511] -> level2_fixmap_pgt */
 	convert_pfn_mfn(level3_kernel_pgt);
 
+	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
 
+	/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
+	 * Both L4[272][0] and L4[511][511] have entries that point to the same
+	 * L2 (PMD) tables. Meaning that if you modify it in __va space
+	 * it will be also modified in the __ka space! (But if you just
+	 * modify the PMD table to point to other PTE's or none, then you
+	 * are OK - which is what cleanup_highmap does) */
 	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	/* Graft it onto L4[511][511] */
 	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
 
+	/* Get [511][510] and graft that in level2_fixmap_pgt */
 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
 	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	/* Note that we don't do anything with level1_fixmap_pgt which
+	 * we don't need. */
 
 	/* Set up identity map */
 	xen_map_identity_early(level2_ident_pgt, max_pfn);

From ae895ed7839f918bbc8d5425b8973b25a534f4eb Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 26 Jul 2012 11:57:04 -0400
Subject: [PATCH 07/15] xen/mmu: use copy_page instead of memcpy.

After all, this is what it is there for.

Acked-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6ba610098dd9..7247e5a62f27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1754,14 +1754,14 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	 * it will be also modified in the __ka space! (But if you just
 	 * modify the PMD table to point to other PTE's or none, then you
 	 * are OK - which is what cleanup_highmap does) */
-	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	copy_page(level2_ident_pgt, l2);
 	/* Graft it onto L4[511][511] */
-	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	copy_page(level2_kernel_pgt, l2);
 
 	/* Get [511][510] and graft that in level2_fixmap_pgt */
 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
-	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	copy_page(level2_fixmap_pgt, l2);
 	/* Note that we don't do anything with level1_fixmap_pgt which
 	 * we don't need. */
 
@@ -1821,8 +1821,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 	 */
 	swapper_kernel_pmd =
 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
-	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
-	       sizeof(pmd_t) * PTRS_PER_PMD);
+	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
 	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
 		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
 	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
@@ -1851,11 +1850,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 				  512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+	copy_page(initial_kernel_pmd, kernel_pmd);
 
 	xen_map_identity_early(initial_kernel_pmd, max_pfn);
 
-	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+	copy_page(initial_page_table, pgd);
 	initial_page_table[KERNEL_PGD_BOUNDARY] =
 		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
 

From caaf9ecf16feffa4f1a5a0d617bc78906a114514 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 12 Jul 2012 13:59:36 -0400
Subject: [PATCH 08/15] xen/mmu: For 64-bit do not call xen_map_identity_early

B/c we do not need it. During the startup the Xen provides
us with all the initial memory mapped that we need to function.

The initial memory mapped is up to the bootstack, which means
we can reference using __ka up to 4.f):

(from xen/interface/xen.h):

 4. This the order of bootstrap elements in the initial virtual region:
   a. relocated kernel image
   b. initial ram disk              [mod_start, mod_len]
   c. list of allocated page frames [mfn_list, nr_pages]
   d. start_info_t structure        [register ESI (x86)]
   e. bootstrap page tables         [pt_base, CR3 (x86)]
   f. bootstrap stack               [register ESP (x86)]

(initial ram disk may be ommitted).

[v1: More comments in git commit]
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 7247e5a62f27..a59070b09055 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -84,6 +84,7 @@
  */
 DEFINE_SPINLOCK(xen_reservation_lock);
 
+#ifdef CONFIG_X86_32
 /*
  * Identity map, in addition to plain kernel map.  This needs to be
  * large enough to allocate page table pages to allocate the rest.
@@ -91,7 +92,7 @@ DEFINE_SPINLOCK(xen_reservation_lock);
  */
 #define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
-
+#endif
 #ifdef CONFIG_X86_64
 /* l3 pud for userspace vsyscall mapping */
 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
@@ -1628,7 +1629,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
 		BUG();
 }
-
+#ifdef CONFIG_X86_32
 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 {
 	unsigned pmdidx, pteidx;
@@ -1679,7 +1680,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 
 	set_page_prot(pmd, PAGE_KERNEL_RO);
 }
-
+#endif
 void __init xen_setup_machphys_mapping(void)
 {
 	struct xen_machphys_mapping mapping;
@@ -1765,14 +1766,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Note that we don't do anything with level1_fixmap_pgt which
 	 * we don't need. */
 
-	/* Set up identity map */
-	xen_map_identity_early(level2_ident_pgt, max_pfn);
-
 	/* Make pagetable pieces RO */
 	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
 

From 488f046df922af992c1a718eff276529c0510885 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 26 Jul 2012 12:00:56 -0400
Subject: [PATCH 09/15] xen/mmu: Recycle the Xen provided L4, L3, and L2 pages

As we are not using them. We end up only using the L1 pagetables
and grafting those to our page-tables.

[v1: Per Stefano's suggestion squashed two commits]
[v2: Per Stefano's suggestion simplified loop]
[v3: Fix smatch warnings]
[v4: Add more comments]
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a59070b09055..b44e6a88ea74 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1708,7 +1708,20 @@ static void convert_pfn_mfn(void *v)
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		pte[i] = xen_make_pte(pte[i].pte);
 }
-
+static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
+				 unsigned long addr)
+{
+	if (*pt_base == PFN_DOWN(__pa(addr))) {
+		set_page_prot((void *)addr, PAGE_KERNEL);
+		clear_page((void *)addr);
+		(*pt_base)++;
+	}
+	if (*pt_end == PFN_DOWN(__pa(addr))) {
+		set_page_prot((void *)addr, PAGE_KERNEL);
+		clear_page((void *)addr);
+		(*pt_end)--;
+	}
+}
 /*
  * Set up the initial kernel pagetable.
  *
@@ -1724,6 +1737,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
 	pud_t *l3;
 	pmd_t *l2;
+	unsigned long addr[3];
+	unsigned long pt_base, pt_end;
+	unsigned i;
 
 	/* max_pfn_mapped is the last pfn mapped in the initial memory
 	 * mappings. Considering that on Xen after the kernel mappings we
@@ -1731,6 +1747,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	 * set max_pfn_mapped to the last real pfn mapped. */
 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
+	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
+	pt_end = pt_base + xen_start_info->nr_pt_frames;
+
 	/* Zap identity mapping */
 	init_level4_pgt[0] = __pgd(0);
 
@@ -1749,6 +1768,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
 
+	addr[0] = (unsigned long)pgd;
+	addr[1] = (unsigned long)l3;
+	addr[2] = (unsigned long)l2;
 	/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
 	 * Both L4[272][0] and L4[511][511] have entries that point to the same
 	 * L2 (PMD) tables. Meaning that if you modify it in __va space
@@ -1782,20 +1804,26 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Unpin Xen-provided one */
 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
-	/* Switch over */
-	pgd = init_level4_pgt;
-
 	/*
 	 * At this stage there can be no user pgd, and no page
 	 * structure to attach it to, so make sure we just set kernel
 	 * pgd.
 	 */
 	xen_mc_batch();
-	__xen_write_cr3(true, __pa(pgd));
+	__xen_write_cr3(true, __pa(init_level4_pgt));
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 
-	memblock_reserve(__pa(xen_start_info->pt_base),
-			 xen_start_info->nr_pt_frames * PAGE_SIZE);
+	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
+	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
+	 * the initial domain. For guests using the toolstack, they are in:
+	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
+	 * rip out the [L4] (pgd), but for guests we shave off three pages.
+	 */
+	for (i = 0; i < ARRAY_SIZE(addr); i++)
+		check_pt_base(&pt_base, &pt_end, addr[i]);
+
+	/* Our (by three pages) smaller Xen pagetable that we are using */
+	memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
 }
 #else	/* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);

From 357a3cfb147ee8e97c6f9cdc51e9a33aa56f7d99 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 19 Jul 2012 13:52:29 -0400
Subject: [PATCH 10/15] xen/p2m: Add logic to revector a P2M tree to use __va
 leafs.

During bootup Xen supplies us with a P2M array. It sticks
it right after the ramdisk, as can be seen with a 128GB PV guest:

(certain parts removed for clarity):
xc_dom_build_image: called
xc_dom_alloc_segment:   kernel       : 0xffffffff81000000 -> 0xffffffff81e43000  (pfn 0x1000 + 0xe43 pages)
xc_dom_pfn_to_ptr: domU mapping: pfn 0x1000+0xe43 at 0x7f097d8bf000
xc_dom_alloc_segment:   ramdisk      : 0xffffffff81e43000 -> 0xffffffff925c7000  (pfn 0x1e43 + 0x10784 pages)
xc_dom_pfn_to_ptr: domU mapping: pfn 0x1e43+0x10784 at 0x7f0952dd2000
xc_dom_alloc_segment:   phys2mach    : 0xffffffff925c7000 -> 0xffffffffa25c7000  (pfn 0x125c7 + 0x10000 pages)
xc_dom_pfn_to_ptr: domU mapping: pfn 0x125c7+0x10000 at 0x7f0942dd2000
xc_dom_alloc_page   :   start info   : 0xffffffffa25c7000 (pfn 0x225c7)
xc_dom_alloc_page   :   xenstore     : 0xffffffffa25c8000 (pfn 0x225c8)
xc_dom_alloc_page   :   console      : 0xffffffffa25c9000 (pfn 0x225c9)
nr_page_tables: 0x0000ffffffffffff/48: 0xffff000000000000 -> 0xffffffffffffffff, 1 table(s)
nr_page_tables: 0x0000007fffffffff/39: 0xffffff8000000000 -> 0xffffffffffffffff, 1 table(s)
nr_page_tables: 0x000000003fffffff/30: 0xffffffff80000000 -> 0xffffffffbfffffff, 1 table(s)
nr_page_tables: 0x00000000001fffff/21: 0xffffffff80000000 -> 0xffffffffa27fffff, 276 table(s)
xc_dom_alloc_segment:   page tables  : 0xffffffffa25ca000 -> 0xffffffffa26e1000  (pfn 0x225ca + 0x117 pages)
xc_dom_pfn_to_ptr: domU mapping: pfn 0x225ca+0x117 at 0x7f097d7a8000
xc_dom_alloc_page   :   boot stack   : 0xffffffffa26e1000 (pfn 0x226e1)
xc_dom_build_image  : virt_alloc_end : 0xffffffffa26e2000
xc_dom_build_image  : virt_pgtab_end : 0xffffffffa2800000

So the physical memory and virtual (using __START_KERNEL_map addresses)
layout looks as so:

  phys                             __ka
/------------\                   /-------------------\
| 0          | empty             | 0xffffffff80000000|
| ..         |                   | ..                |
| 16MB       | <= kernel starts  | 0xffffffff81000000|
| ..         |                   |                   |
| 30MB       | <= kernel ends => | 0xffffffff81e43000|
| ..         |  & ramdisk starts | ..                |
| 293MB      | <= ramdisk ends=> | 0xffffffff925c7000|
| ..         |  & P2M starts     | ..                |
| ..         |                   | ..                |
| 549MB      | <= P2M ends    => | 0xffffffffa25c7000|
| ..         | start_info        | 0xffffffffa25c7000|
| ..         | xenstore          | 0xffffffffa25c8000|
| ..         | cosole            | 0xffffffffa25c9000|
| 549MB      | <= page tables => | 0xffffffffa25ca000|
| ..         |                   |                   |
| 550MB      | <= PGT end     => | 0xffffffffa26e1000|
| ..         | boot stack        |                   |
\------------/                   \-------------------/

As can be seen, the ramdisk, P2M and pagetables are taking
a bit of __ka addresses space. Which is a problem since the
MODULES_VADDR starts at 0xffffffffa0000000 - and P2M sits
right in there! This results during bootup with the inability to
load modules, with this error:

------------[ cut here ]------------
WARNING: at /home/konrad/ssd/linux/mm/vmalloc.c:106 vmap_page_range_noflush+0x2d9/0x370()
Call Trace:
 [<ffffffff810719fa>] warn_slowpath_common+0x7a/0xb0
 [<ffffffff81030279>] ? __raw_callee_save_xen_pmd_val+0x11/0x1e
 [<ffffffff81071a45>] warn_slowpath_null+0x15/0x20
 [<ffffffff81130b89>] vmap_page_range_noflush+0x2d9/0x370
 [<ffffffff81130c4d>] map_vm_area+0x2d/0x50
 [<ffffffff811326d0>] __vmalloc_node_range+0x160/0x250
 [<ffffffff810c5369>] ? module_alloc_update_bounds+0x19/0x80
 [<ffffffff810c6186>] ? load_module+0x66/0x19c0
 [<ffffffff8105cadc>] module_alloc+0x5c/0x60
 [<ffffffff810c5369>] ? module_alloc_update_bounds+0x19/0x80
 [<ffffffff810c5369>] module_alloc_update_bounds+0x19/0x80
 [<ffffffff810c70c3>] load_module+0xfa3/0x19c0
 [<ffffffff812491f6>] ? security_file_permission+0x86/0x90
 [<ffffffff810c7b3a>] sys_init_module+0x5a/0x220
 [<ffffffff815ce339>] system_call_fastpath+0x16/0x1b
---[ end trace fd8f7704fdea0291 ]---
vmalloc: allocation failure, allocated 16384 of 20480 bytes
modprobe: page allocation failure: order:0, mode:0xd2

Since the __va and __ka are 1:1 up to MODULES_VADDR and
cleanup_highmap rids __ka of the ramdisk mapping, what
we want to do is similar - get rid of the P2M in the __ka
address space. There are two ways of fixing this:

 1) All P2M lookups instead of using the __ka address would
    use the __va address. This means we can safely erase from
    __ka space the PMD pointers that point to the PFNs for
    P2M array and be OK.
 2). Allocate a new array, copy the existing P2M into it,
    revector the P2M tree to use that, and return the old
    P2M to the memory allocate. This has the advantage that
    it sets the stage for using XEN_ELF_NOTE_INIT_P2M
    feature. That feature allows us to set the exact virtual
    address space we want for the P2M - and allows us to
    boot as initial domain on large machines.

So we pick option 2).

This patch only lays the groundwork in the P2M code. The patch
that modifies the MMU is called "xen/mmu: Copy and revector the P2M tree."

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c     | 70 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/xen-ops.h |  1 +
 2 files changed, 71 insertions(+)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index e4adbfbdfada..996ee2bf7bdb 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -389,7 +389,77 @@ void __init xen_build_dynamic_phys_to_machine(void)
 
 	m2p_override_init();
 }
+#ifdef CONFIG_X86_64
+#include <linux/bootmem.h>
+unsigned long __init xen_revector_p2m_tree(void)
+{
+	unsigned long va_start;
+	unsigned long va_end;
+	unsigned long pfn;
+	unsigned long *mfn_list = NULL;
+	unsigned long size;
 
+	va_start = xen_start_info->mfn_list;
+	/*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
+	 * so make sure it is rounded up to that */
+	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+	va_end = va_start + size;
+
+	/* If we were revectored already, don't do it again. */
+	if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
+		return 0;
+
+	mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
+	if (!mfn_list) {
+		pr_warn("Could not allocate space for a new P2M tree!\n");
+		return xen_start_info->mfn_list;
+	}
+	/* Fill it out with INVALID_P2M_ENTRY value */
+	memset(mfn_list, 0xFF, size);
+
+	for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx;
+		unsigned long *mid_p;
+
+		if (!p2m_top[topidx])
+			continue;
+
+		if (p2m_top[topidx] == p2m_mid_missing)
+			continue;
+
+		mididx = p2m_mid_index(pfn);
+		mid_p = p2m_top[topidx][mididx];
+		if (!mid_p)
+			continue;
+		if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
+			continue;
+
+		if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
+			continue;
+
+		/* The old va. Rebase it on mfn_list */
+		if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
+			unsigned long *new;
+
+			new = &mfn_list[pfn];
+
+			copy_page(new, mid_p);
+			p2m_top[topidx][mididx] = &mfn_list[pfn];
+			p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]);
+
+		}
+		/* This should be the leafs allocated for identity from _brk. */
+	}
+	return (unsigned long)mfn_list;
+
+}
+#else
+unsigned long __init xen_revector_p2m_tree(void)
+{
+	return 0;
+}
+#endif
 unsigned long get_phys_to_machine(unsigned long pfn)
 {
 	unsigned topidx, mididx, idx;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2230f57a6ebe..bb5a8105ea86 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -45,6 +45,7 @@ void xen_hvm_init_shared_info(void);
 void xen_unplug_emulated_devices(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
+unsigned long __init xen_revector_p2m_tree(void);
 
 void xen_init_irq_ops(void);
 void xen_setup_timer(int cpu);

From 7f9140626c757b773624b97865cb53c2a8348a69 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 26 Jul 2012 12:47:40 -0400
Subject: [PATCH 11/15] xen/mmu: Copy and revector the P2M tree.

Please first read the description in "xen/p2m: Add logic to revector a
P2M tree to use __va leafs" patch.

The 'xen_revector_p2m_tree()' function allocates a new P2M tree
copies the contents of the old one in it, and returns the new one.

At this stage, the __ka address space (which is what the old
P2M tree was using) is partially disassembled. The cleanup_highmap
has removed the PMD entries from 0-16MB and anything past _brk_end
up to the max_pfn_mapped (which is the end of the ramdisk).

We have revectored the P2M tree (and the one for save/restore as well)
to use new shiny __va address to new MFNs. The xen_start_info
has been taken care of already in 'xen_setup_kernel_pagetable()' and
xen_start_info->shared_info in 'xen_setup_shared_info()', so
we are free to roam and delete PMD entries - which is exactly what
we are going to do. We rip out the __ka for the old P2M array.

[v1: Fix smatch warnings]
[v2: memset was doing 0 instead of 0xff]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b44e6a88ea74..a640949f78d4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1183,9 +1183,64 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
 
 static void xen_post_allocator_init(void);
 
+#ifdef CONFIG_X86_64
+static void __init xen_cleanhighmap(unsigned long vaddr,
+				    unsigned long vaddr_end)
+{
+	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
+
+	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
+	 * We include the PMD passed in on _both_ boundaries. */
+	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
+			pmd++, vaddr += PMD_SIZE) {
+		if (pmd_none(*pmd))
+			continue;
+		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
+			set_pmd(pmd, __pmd(0));
+	}
+	/* In case we did something silly, we should crash in this function
+	 * instead of somewhere later and be confusing. */
+	xen_mc_flush();
+}
+#endif
 static void __init xen_pagetable_setup_done(pgd_t *base)
 {
+#ifdef CONFIG_X86_64
+	unsigned long size;
+	unsigned long addr;
+#endif
+
 	xen_setup_shared_info();
+#ifdef CONFIG_X86_64
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		unsigned long new_mfn_list;
+
+		size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+		/* On 32-bit, we get zero so this never gets executed. */
+		new_mfn_list = xen_revector_p2m_tree();
+		if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
+			/* using __ka address and sticking INVALID_P2M_ENTRY! */
+			memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+			/* We should be in __ka space. */
+			BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
+			addr = xen_start_info->mfn_list;
+			size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+			/* We roundup to the PMD, which means that if anybody at this stage is
+			 * using the __ka address of xen_start_info or xen_start_info->shared_info
+			 * they are in going to crash. Fortunatly we have already revectored
+			 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+			size = roundup(size, PMD_SIZE);
+			xen_cleanhighmap(addr, addr + size);
+
+			memblock_free(__pa(xen_start_info->mfn_list), size);
+			/* And revector! Bye bye old array */
+			xen_start_info->mfn_list = new_mfn_list;
+		}
+	}
+#endif
 	xen_post_allocator_init();
 }
 
@@ -1824,6 +1879,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
 	/* Our (by three pages) smaller Xen pagetable that we are using */
 	memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+	/* Revector the xen_start_info */
+	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
 }
 #else	/* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);

From 3aca7fbc8ede0dd194317b2e3144815128ffb232 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 14 Aug 2012 14:34:00 -0400
Subject: [PATCH 12/15] xen/mmu: Remove from __ka space PMD entries for
 pagetables.

Please first read the description in "xen/mmu: Copy and revector the
P2M tree."

At this stage, the __ka address space (which is what the old
P2M tree was using) is partially disassembled. The cleanup_highmap
has removed the PMD entries from 0-16MB and anything past _brk_end
up to the max_pfn_mapped (which is the end of the ramdisk).

The xen_remove_p2m_tree and code around has ripped out the __ka for
the old P2M array.

Here we continue on doing it to where the Xen page-tables were.
It is safe to do it, as the page-tables are addressed using __va.
For good measure we delete anything that is within MODULES_VADDR
and up to the end of the PMD.

At this point the __ka only contains PMD entries for the start
of the kernel up to __brk.

[v1: Per Stefano's suggestion wrapped the MODULES_VADDR in debug]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a640949f78d4..3f8e963b76c0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1240,6 +1240,25 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
 			xen_start_info->mfn_list = new_mfn_list;
 		}
 	}
+	/* At this stage, cleanup_highmap has already cleaned __ka space
+	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
+	 * the ramdisk). We continue on, erasing PMD entries that point to page
+	 * tables - do note that they are accessible at this stage via __va.
+	 * For good measure we also round up to the PMD - which means that if
+	 * anybody is using __ka address to the initial boot-stack - and try
+	 * to use it - they are going to crash. The xen_start_info has been
+	 * taken care of already in xen_setup_kernel_pagetable. */
+	addr = xen_start_info->pt_base;
+	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
+
+	xen_cleanhighmap(addr, addr + size);
+	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
+#ifdef DEBUG
+	/* This is superflous and is not neccessary, but you know what
+	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
+	 * anything at this stage. */
+	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
+#endif
 #endif
 	xen_post_allocator_init();
 }

From 785f62314984ea3af9dd830b020289ba2509ae69 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 14 Aug 2012 16:37:31 -0400
Subject: [PATCH 13/15] xen/mmu: Release just the MFN list, not MFN list and
 part of pagetables.

We call memblock_reserve for [start of mfn list] -> [PMD aligned end
of mfn list] instead of <start of mfn list> -> <page aligned end of mfn list].

This has the disastrous effect that if at bootup the end of mfn_list is
not PMD aligned we end up returning to memblock parts of the region
past the mfn_list array. And those parts are the PTE tables with
the disastrous effect of seeing this at bootup:

Write protecting the kernel read-only data: 10240k
Freeing unused kernel memory: 1860k freed
Freeing unused kernel memory: 200k freed
(XEN) mm.c:2429:d0 Bad type (saw 1400000000000002 != exp 7000000000000000) for mfn 116a80 (pfn 14e26)
...
(XEN) mm.c:908:d0 Error getting mfn 116a83 (pfn 14e2a) from L1 entry 8000000116a83067 for l1e_owner=0, pg_owner=0
(XEN) mm.c:908:d0 Error getting mfn 4040 (pfn 5555555555555555) from L1 entry 0000000004040601 for l1e_owner=0, pg_owner=0
.. and so on.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f8e963b76c0..5b2cb54425ce 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1227,7 +1227,6 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
 			/* We should be in __ka space. */
 			BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
 			addr = xen_start_info->mfn_list;
-			size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
 			/* We roundup to the PMD, which means that if anybody at this stage is
 			 * using the __ka address of xen_start_info or xen_start_info->shared_info
 			 * they are in going to crash. Fortunatly we have already revectored
@@ -1235,6 +1234,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
 			size = roundup(size, PMD_SIZE);
 			xen_cleanhighmap(addr, addr + size);
 
+			size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
 			memblock_free(__pa(xen_start_info->mfn_list), size);
 			/* And revector! Bye bye old array */
 			xen_start_info->mfn_list = new_mfn_list;

From 3fc509fc0c590900568ef516a37101d88f3476f5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 16 Aug 2012 16:38:55 -0400
Subject: [PATCH 14/15] xen/p2m: When revectoring deal with holes in the P2M
 array.

When we free the PFNs and then subsequently populate them back
during bootup:

Freeing 20000-20200 pfn range: 512 pages freed
1-1 mapping on 20000->20200
Freeing 40000-40200 pfn range: 512 pages freed
1-1 mapping on 40000->40200
Freeing bad80-badf4 pfn range: 116 pages freed
1-1 mapping on bad80->badf4
Freeing badf6-bae7f pfn range: 137 pages freed
1-1 mapping on badf6->bae7f
Freeing bb000-100000 pfn range: 282624 pages freed
1-1 mapping on bb000->100000
Released 283999 pages of unused memory
Set 283999 page(s) to 1-1 mapping
Populating 1acb8a-1f20e9 pfn range: 283999 pages added

We end up having the P2M array (that is the one that was
grafted on the P2M tree) filled with IDENTITY_FRAME or
INVALID_P2M_ENTRY) entries. The patch titled

"xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID."
recycles said slots and replaces the P2M tree leaf's with
 &mfn_list[xx] with p2m_identity or p2m_missing.

And re-uses the P2M array sections for other P2M tree leaf's.
For the above mentioned bootup excerpt, the PFNs at
0x20000->0x20200 are going to be IDENTITY based:

P2M[0][256][0] -> P2M[0][257][0] get turned in IDENTITY_FRAME.

We can re-use that and replace P2M[0][256] to point to p2m_identity.
The "old" page (the grafted P2M array provided by Xen) that was at
P2M[0][256] gets put somewhere else. Specifically at P2M[6][358],
b/c when we populate back:

Populating 1acb8a-1f20e9 pfn range: 283999 pages added

we fill P2M[6][358][0] (and P2M[6][358], P2M[6][359], ...) with
the new MFNs.

That is all OK, except when we revector we assume that the PFN
count would be the same in the grafted P2M array and in the
newly allocated. Since that is no longer the case, as we have
holes in the P2M that point to p2m_missing or p2m_identity we
have to take that into account.

[v2: Check for overflow]
[v3: Move within the __va check]
[v4: Fix the computation]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 996ee2bf7bdb..c3e92912c3fb 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -396,6 +396,7 @@ unsigned long __init xen_revector_p2m_tree(void)
 	unsigned long va_start;
 	unsigned long va_end;
 	unsigned long pfn;
+	unsigned long pfn_free = 0;
 	unsigned long *mfn_list = NULL;
 	unsigned long size;
 
@@ -442,11 +443,18 @@ unsigned long __init xen_revector_p2m_tree(void)
 		if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
 			unsigned long *new;
 
-			new = &mfn_list[pfn];
+			if (pfn_free  > (size / sizeof(unsigned long))) {
+				WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
+				     size / sizeof(unsigned long), pfn_free);
+				return 0;
+			}
+			new = &mfn_list[pfn_free];
 
 			copy_page(new, mid_p);
-			p2m_top[topidx][mididx] = &mfn_list[pfn];
-			p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]);
+			p2m_top[topidx][mididx] = &mfn_list[pfn_free];
+			p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]);
+
+			pfn_free += P2M_PER_PAGE;
 
 		}
 		/* This should be the leafs allocated for identity from _brk. */

From 328731876451a837f56e66ffa11de053ed5daf73 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 17 Aug 2012 09:35:31 -0400
Subject: [PATCH 15/15] xen/mmu: If the revector fails, don't attempt to
 revector anything else.

If the P2M revectoring would fail, we would try to continue on by
cleaning the PMD for L1 (PTE) page-tables. The xen_cleanhighmap
is greedy and erases the PMD on both boundaries. Since the P2M
array can share the PMD, we would wipe out part of the __ka
that is still used in the P2M tree to point to P2M leafs.

This fixes it by bypassing the revectoring and continuing on.
If the revector fails, a nice WARN is printed so we can still
troubleshoot this.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5b2cb54425ce..cb9db72b33f8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1238,7 +1238,8 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
 			memblock_free(__pa(xen_start_info->mfn_list), size);
 			/* And revector! Bye bye old array */
 			xen_start_info->mfn_list = new_mfn_list;
-		}
+		} else
+			goto skip;
 	}
 	/* At this stage, cleanup_highmap has already cleaned __ka space
 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1259,6 +1260,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
 	 * anything at this stage. */
 	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
 #endif
+skip:
 #endif
 	xen_post_allocator_init();
 }