pagemap: fix pfn calculation for hugepage

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Linux Kernel Mailing List
Date: Wednesday, April 7, 2010 - 9:59 am

Gitweb:     http://git.kernel.org/linus/116354d177ba2da37e91cf884e3d11e67f825efd
Commit:     116354d177ba2da37e91cf884e3d11e67f825efd
Parent:     57119c34e53bbb8d244c3ff1335ef5145768538f
Author:     Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
AuthorDate: Tue Apr 6 14:35:04 2010 -0700
Committer:  Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Wed Apr 7 08:38:04 2010 -0700

    pagemap: fix pfn calculation for hugepage
    
    When we look into pagemap using page-types with option -p, the value of
    pfn for hugepages looks wrong (see below.) This is because pte was
    evaluated only once for one vma although it should be updated for each
    hugepage.  This patch fixes it.
    
      $ page-types -p 3277 -Nl -b huge
      voffset   offset  len     flags
      7f21e8a00 11e400  1       ___U___________H_G________________
      7f21e8a01 11e401  1ff     ________________TG________________
                   ^^^
      7f21e8c00 11e400  1       ___U___________H_G________________
      7f21e8c01 11e401  1ff     ________________TG________________
                   ^^^
    
    One hugepage contains 1 head page and 511 tail pages in x86_64 and each
    two lines represent each hugepage.  Voffset and offset mean virtual
    address and physical address in the page unit, respectively.  The
    different hugepages should not have the same offset value.
    
    With this patch applied:
    
      $ page-types -p 3386 -Nl -b huge
      voffset   offset   len    flags
      7fec7a600 112c00   1      ___UD__________H_G________________
      7fec7a601 112c01   1ff    ________________TG________________
                   ^^^
      7fec7a800 113200   1      ___UD__________H_G________________
      7fec7a801 113201   1ff    ________________TG________________
                   ^^^
                   OK
    
    More info:
    
    - This patch modifies walk_page_range()'s hugepage walker.  But the
      change only affects pagemap_read(), which is the only caller of hugepage
      callback.
    
    - Without this patch, hugetlb_entry() callback is called per vma, that
      doesn't match the natural expectation from its name.
    
    - With this patch, hugetlb_entry() is called per hugepte entry and the
      callback can become much simpler.
    
    Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
    Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
    Acked-by: Matt Mackall <mpm@selenic.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c |   27 +++++++--------------------
 include/linux/mm.h |    4 ++--
 mm/pagewalk.c      |   47 +++++++++++++++++++++++++++++++++++++----------
 3 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a05a669..0705534 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -662,31 +662,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e70f21b..462acaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -783,8 +783,8 @@ struct mm_walk {
 	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
 	int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
 	int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
-	int (*hugetlb_entry)(pte_t *, unsigned long, unsigned long,
-			     struct mm_walk *);
+	int (*hugetlb_entry)(pte_t *, unsigned long,
+			     unsigned long, unsigned long, struct mm_walk *);
 	struct mm_struct *mm;
 	void *private;
 };
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57..8b1a2ce 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+				       unsigned long end)
+{
+	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+	return boundary < end ? boundary : end;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+			      unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long next;
+	unsigned long hmask = huge_page_mask(h);
+	pte_t *pte;
+	int err = 0;
+
+	do {
+		next = hugetlb_entry_end(h, addr, end);
+		pte = huge_pte_offset(walk->mm, addr & hmask);
+		if (pte && walk->hugetlb_entry)
+			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+		if (err)
+			return err;
+	} while (addr = next, addr != end);
+
+	return 0;
+}
+#endif
+
 /**
  * walk_page_range - walk a memory map's page tables with a callback
  * @mm: memory map to walk
@@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end,
 		vma = find_vma(walk->mm, addr);
 #ifdef CONFIG_HUGETLB_PAGE
 		if (vma && is_vm_hugetlb_page(vma)) {
-			pte_t *pte;
-			struct hstate *hs;
-
 			if (vma->vm_end < next)
 				next = vma->vm_end;
-			hs = hstate_vma(vma);
-			pte = huge_pte_offset(walk->mm,
-					      addr & huge_page_mask(hs));
-			if (pte && !huge_pte_none(huge_ptep_get(pte))
-			    && walk->hugetlb_entry)
-				err = walk->hugetlb_entry(pte, addr,
-							  next, walk);
+			/*
+			 * Hugepage is very tightly coupled with vma, so
+			 * walk through hugetlb entries within a given vma.
+			 */
+			err = walk_hugetlb_range(vma, addr, next, walk);
 			if (err)
 				break;
+			pgd = pgd_offset(walk->mm, next);
 			continue;
 		}
 #endif
--
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
pagemap: fix pfn calculation for hugepage, Linux Kernel Mailing ..., (Wed Apr 7, 9:59 am)