[PATCH 17 of 36] x86: preallocate and prepopulate separately

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: Ingo Molnar <mingo@...>
Cc: LKML <linux-kernel@...>, <x86@...>, xen-devel <xen-devel@...>, Stephen Tweedie <sct@...>, Eduardo Habkost <ehabkost@...>, Mark McLoughlin <markmc@...>
Date: Wednesday, June 25, 2008 - 12:19 am

Jan Beulich points out that vmalloc_sync_all() assumes that the
kernel's pmd is always present in the pgd.  The current pgd
construction code adds the pgd to the pgd_list before its pmds have
been pre-populated, thereby making a partially constructed pgd visible
to vmalloc_sync_all().

However, because pgd_prepopulate_pmd() also does the allocation, it may
block and so cannot be called while holding a spinlock.

The solution is to preallocate the pmds outside the spinlock, then
populate them while holding the pgd_list lock (pgd_lock).

This patch also moves the pmd preallocation and mop-up functions out of
the CONFIG_X86_PAE conditional so that they are common to both
configurations, assuming that the compiler will generate no code for
them when PREALLOCATED_PMDS is 0.  Also, there's no need for pgd_ctor
to clear the pgd again, since it's allocated as a zeroed page.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jan Beulich <jbeulich@novell.com>
---
 arch/x86/mm/pgtable.c |  177 +++++++++++++++++++++++++++++--------------------
 1 file changed, 105 insertions(+), 72 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -66,12 +66,6 @@
 static void pgd_ctor(void *p)
 {
 	pgd_t *pgd = p;
-	unsigned long flags;
-
-	/* Clear usermode parts of PGD */
-	memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-	spin_lock_irqsave(&pgd_lock, flags);
 
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -91,8 +85,6 @@
 	/* list required to sync kernel mapping updates */
 	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-
-	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 static void pgd_dtor(void *pgd)
@@ -120,30 +112,6 @@
 
 #ifdef CONFIG_X86_PAE
 /*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-	int i;
-
-	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
-		pgd_t pgd = pgdp[i];
-
-		if (pgd_val(pgd) != 0) {
-			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
-			pgdp[i] = native_make_pgd(0);
-
-			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
-			pmd_free(mm, pmd);
-		}
-	}
-}
-
-/*
  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
  * updating the top-level pagetable entries to guarantee the
  * processor notices the update.  Since this is expensive, and
@@ -154,31 +122,7 @@
  * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
  * and initialize the kernel pmds here.
  */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-	pud_t *pud;
-	unsigned long addr;
-	int i;
-
-	pud = pud_offset(pgd, 0);
- 	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-	     i++, pud++, addr += PUD_SIZE) {
-		pmd_t *pmd = pmd_alloc_one(mm, addr);
-
-		if (!pmd) {
-			pgd_mop_up_pmds(mm, pgd);
-			return 0;
-		}
-
-		if (i >= KERNEL_PGD_BOUNDARY)
-			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
-			       sizeof(pmd_t) * PTRS_PER_PMD);
-
-		pud_populate(mm, pud, pmd);
-	}
-
-	return 1;
-}
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
 
 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 {
@@ -198,35 +142,124 @@
 		write_cr3(read_cr3());
 }
 #else  /* !CONFIG_X86_PAE */
+
 /* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
 {
-	return 1;
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i])
+			free_page((unsigned long)pmds[i]);
 }
 
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+static int preallocate_pmds(pmd_t *pmds[])
 {
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		if (pmd == NULL)
+			failed = true;
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
 }
-#endif	/* CONFIG_X86_PAE */
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pgd_t pgd = pgdp[i];
+
+		if (pgd_val(pgd) != 0) {
+			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+			pgdp[i] = native_make_pgd(0);
+
+			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+			pmd_free(mm, pmd);
+		}
+	}
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+	pud_t *pud;
+	unsigned long addr;
+	int i;
+
+	pud = pud_offset(pgd, 0);
+
+ 	for (addr = i = 0; i < PREALLOCATED_PMDS;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmds[i];
+
+		if (i >= KERNEL_PGD_BOUNDARY)
+			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+
+		pud_populate(mm, pud, pmd);
+	}
+}
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+	unsigned long flags;
 
-	/* so that alloc_pmd can use it */
+	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (pgd == NULL)
+		goto out;
+
 	mm->pgd = pgd;
-	if (pgd) {
-		pgd_ctor(pgd);
 
-		if (paravirt_pgd_alloc(mm) != 0 ||
-		    !pgd_prepopulate_pmd(mm, pgd)) {
-			pgd_dtor(pgd);
-			free_page((unsigned long)pgd);
-			pgd = NULL;
-		}
-	}
+	if (preallocate_pmds(pmds) != 0)
+		goto out_free_pgd;
+
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
+
+	/*
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+
+	pgd_ctor(pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
+
+out_free_pmds:
+	free_pmds(pmds);
+out_free_pgd:
+	free_page((unsigned long)pgd);
+out:
+	return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)


--
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
[PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen sup..., Jeremy Fitzhardinge, (Wed Jun 25, 12:18 am)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Wed Jun 25, 4:03 pm)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Wed Jun 25, 4:12 pm)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Thu Jun 26, 3:02 pm)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Thu Jun 26, 2:25 pm)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Thu Jun 26, 10:28 am)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Thu Jun 26, 10:34 am)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Fri Jun 27, 3:04 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Sun Jun 29, 11:02 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Mon Jun 30, 7:04 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Tue Jul 1, 12:14 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Thu Jul 3, 2:25 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Thu Jul 3, 2:41 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Thu Jul 3, 11:47 am)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Tue Jul 1, 12:10 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Mon Jun 30, 1:57 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Mon Jun 30, 1:17 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Mon Jun 30, 2:36 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Mon Jun 30, 1:32 am)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Fri Jun 27, 12:02 pm)
Re: [Xen-devel] Re: [PATCH 00 of 36] x86/paravirt: groundwor..., Jeremy Fitzhardinge, (Fri Jun 27, 12:25 pm)
Re: [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen..., Jeremy Fitzhardinge, (Wed Jun 25, 7:46 am)
[PATCH 36 of 36] x86_64/paravirt: Make load_gs_index() a par..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
Re: [PATCH 36 of 36] x86_64/paravirt: Make load_gs_index() a..., Jeremy Fitzhardinge, (Wed Jun 25, 7:48 am)
[PATCH 26 of 36] x86_64: Split set_pte_vaddr(), Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 35 of 36] x86_64/paravirt: add adjust_exception_frame, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 04 of 36] x86: remove open-coded save/load segment op..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 25 of 36] x86_64: PSE no longer a hard requirement, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 18 of 36] x86/paravirt: add debugging for missing ope..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 01 of 36] x86: asm-x86/pgtable.h: fix compiler warning, Jeremy Fitzhardinge, (Wed Jun 25, 12:18 am)
[PATCH 32 of 36] Add sysret/sysexit pvops for returning to 3..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 31 of 36] x86_64 pvops: don't restore user rsp within..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 06 of 36] x86_64: use p??_populate() to attach pages ..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 07 of 36] x86_64: unify early_ioremap, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 19 of 36] paravirt_ops: define PARA_INDIRECT for indi..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 28 of 36] Save %fs and %gs before load_TLS() and arch..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 30 of 36] x86/paravirt_ops: split sysret and sysexit, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 21 of 36] x86-64: add FIX_PARAVIRT_BOOTMAP fixmap slot, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 20 of 36] paravirt/x86_64: move __PAGE_OFFSET to leav..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 34 of 36] x86_64: swapgs pvop with a user-stack can n..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 05 of 36] x86_64: use write_gdt_entry in vsyscall_set..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 33 of 36] x86_64: ia32entry: replace privileged instr..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 27 of 36] x86_64: __switch_to(): Move arch_leave_lazy..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 29 of 36] Use __KERNEL_DS as SS when returning to a k..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 24 of 36] x86_64: create small vmemmap mappings if PS..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 10 of 36] x86: unify pgd_index, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 11 of 36] x86: unify mmu_context.h, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 08 of 36] x86_64: Add gate_offset() and gate_segment(..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 09 of 36] x86_64: Use __pgd() on mk_kernel_pgd(), Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 23 of 36] x86_64: adjust mapping of physical pagetabl..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 14 of 36] x86_64: add sync_cmpxchg, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 03 of 36] x86: add memory barriers to wrmsr, Jeremy Fitzhardinge, (Wed Jun 25, 12:18 am)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, Arjan van de Ven, (Wed Jun 25, 12:44 am)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, Jeremy Fitzhardinge, (Wed Jun 25, 5:08 pm)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, Arjan van de Ven, (Wed Jun 25, 6:31 pm)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, H. Peter Anvin, (Wed Jun 25, 7:18 pm)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, Jeremy Fitzhardinge, (Wed Jun 25, 7:37 pm)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, H. Peter Anvin, (Wed Jun 25, 7:42 pm)
Re: [PATCH 03 of 36] x86: add memory barriers to wrmsr, Jeremy Fitzhardinge, (Wed Jun 25, 7:05 pm)
[PATCH 13 of 36] x86_64: add prototype for x86_64_start_kern..., Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 12 of 36] x86_64: replace end_pfn with num_physpages, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 17 of 36] x86: preallocate and prepopulate separately, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 15 of 36] x86: simplify vmalloc_sync_all, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 22 of 36] x86_64: split x86_64_start_kernel, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 16 of 36] x86/paravirt: add a pgd_alloc/free hooks, Jeremy Fitzhardinge, (Wed Jun 25, 12:19 am)
[PATCH 02 of 36] x86: add memory clobber to save/loadsegment, Jeremy Fitzhardinge, (Wed Jun 25, 12:18 am)