Hi, This is the 3rd version of "hugepage migration" set. I rebased this onto 2.6.36-rc2 and merged many comments from you. In previous discussion, I explained why hugepage migration encounts no race with direct I/O without additional page locking. Based on that reasoning, I made no change on page locking on migration code (i.e. lock only head pages.) Future works: - Migration can fail for various reasons depending on various factors, so it's useful if soft offline can be retried when it noticed migration fails. This problem is a more general one because it's applied for soft offline of normal-sized pages. So we leave it as a future work. - Corrupted hugepage counter implemeted in the previous version was dropped because it's not directly related to migration topic and have no serious impact on kernel behavior. We also leave it as the next work. Summary: [PATCH 1/8] hugetlb: fix metadata corruption in hugetlb_fault() [PATCH 2/8] hugetlb: add allocate function for hugepage migration [PATCH 3/8] hugetlb: rename hugepage allocation functions [PATCH 4/8] hugetlb: redefine hugepage copy functions [PATCH 5/8] hugetlb: hugepage migration core [PATCH 6/8] HWPOISON, hugetlb: soft offlining for hugepage [PATCH 7/8] HWPOISON, hugetlb: fix unpoison for hugepage [PATCH 8/8] page-types.c: fix name of unpoison interface Documentation/vm/page-types.c | 2 +- fs/hugetlbfs/inode.c | 15 +++ include/linux/hugetlb.h | 9 ++ include/linux/migrate.h | 12 +++ mm/hugetlb.c | 216 ++++++++++++++++++++++++++++++++--------- mm/memory-failure.c | 65 +++++++++---- mm/migrate.c | 192 +++++++++++++++++++++++++++++++++---- mm/vmscan.c | 9 ++- 8 files changed, 434 insertions(+), 86 deletions(-) Thanks, Naoya Horiguchi --
The function name alloc_huge_page_no_vma_node() has verbose suffix "_no_vma".
This patch makes existing alloc_huge_page() and it's family have "_vma" instead,
which makes it easier to read.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
---
include/linux/hugetlb.h | 4 ++--
mm/hugetlb.c | 20 ++++++++++----------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git v2.6.36-rc2/include/linux/hugetlb.h v2.6.36-rc2/include/linux/hugetlb.h
index 142bd4f..0b73c53 100644
--- v2.6.36-rc2/include/linux/hugetlb.h
+++ v2.6.36-rc2/include/linux/hugetlb.h
@@ -228,7 +228,7 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
-struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid);
+struct page *alloc_huge_page_node(struct hstate *h, int nid);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -305,7 +305,7 @@ static inline struct hstate *page_hstate(struct page *page)
#else
struct hstate {};
-#define alloc_huge_page_no_vma_node(h, nid) NULL
+#define alloc_huge_page_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
index 31118d2..674a25e 100644
--- v2.6.36-rc2/mm/hugetlb.c
+++ v2.6.36-rc2/mm/hugetlb.c
@@ -666,7 +666,7 @@ static struct page *alloc_buddy_huge_page_node(struct hstate *h, int nid)
* E.g. soft-offlining uses this function because it only cares physical
* address of error page.
*/
-struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -818,7 +818,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
+static struct page *alloc_buddy_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct ...alloc_buddy_huge_page() doesn't make use of @vma at all, so the
parameters can be removed.
It looks cleaner to fold the
alloc_huge_page_no_vma_node=>alloc_huge_page_node renames into the
previous patch, from there split out the code refactor chunks into
a standalone patch, and then include this cleanup patch.
Thanks,
Fengguang
---
hugetlb: remove unused alloc_buddy_huge_page() parameters
alloc_buddy_huge_page() doesn't make use of @vma at all, so the
parameters can be removed.
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc5be78..3114b4c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -770,8 +770,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h)
{
struct page *page;
unsigned int nid;
@@ -871,7 +870,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
+ page = alloc_buddy_huge_page(h);
if (!page) {
/*
* We were not able to allocate enough pages to
@@ -1052,7 +1051,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page(h);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
--
When we unify alloc_buddy_huge_page() as commented for patch 2/8, _vma suffix is not necessary any more. Your suggestion to drop unused arguments sounds reasonable, so I merge the attached patch into patch 2/8. Thanks, Naoya --
This patch extends page migration code to support hugepage migration.
One of the potential users of this feature is soft offlining which
is triggered by memory corrected errors (added by the next patch.)
Todo:
- there are other users of page migration such as memory policy,
memory hotplug and memocy compaction.
They are not ready for hugepage support for now.
ChangeLog since v2:
- refactor isolate/putback_lru_page() to handle hugepage
- add comment about race on unmap_and_move_huge_page()
ChangeLog since v1:
- divide migration code path for hugepage
- define routine checking migration swap entry for hugetlb
- replace "goto" with "if/else" in remove_migration_pte()
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
fs/hugetlbfs/inode.c | 15 ++++
include/linux/migrate.h | 12 +++
mm/hugetlb.c | 18 ++++-
mm/migrate.c | 192 ++++++++++++++++++++++++++++++++++++++++++-----
mm/vmscan.c | 9 ++-
5 files changed, 225 insertions(+), 21 deletions(-)
diff --git v2.6.36-rc2/fs/hugetlbfs/inode.c v2.6.36-rc2/fs/hugetlbfs/inode.c
index 6e5bd42..1f7ca50 100644
--- v2.6.36-rc2/fs/hugetlbfs/inode.c
+++ v2.6.36-rc2/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
+#include <linux/migrate.h>
#include <asm/uaccess.h>
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
return 0;
}
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int rc;
+
+ rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ if (rc)
+ return rc;
+ migrate_page_copy(newpage, page);
+
+ return 0;
+}
+
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct ...In order to avoid metadata corruption when memory failure occurs between
alloc_huge_page() and lock_page().
The corruption occurs because page fault can fail with metadata changes
remained (such as refcount, mapcount, etc.)
Since the PageHWPoison() check is for avoiding hwpoisoned page remained
in pagecache mapping to the process, it should be done in "found in pagecache"
branch, not in the common path.
This patch moves the check to "found in pagecache" branch and fix the problem.
ChangeLog since v2:
- remove retry check in "new allocation" path.
- make description more detailed
- change patch name from "HWPOISON, hugetlb: move PG_HWPoison bit check"
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
mm/hugetlb.c | 21 +++++++++------------
1 files changed, 9 insertions(+), 12 deletions(-)
diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
index cc5be78..6871b41 100644
--- v2.6.36-rc2/mm/hugetlb.c
+++ v2.6.36-rc2/mm/hugetlb.c
@@ -2518,22 +2518,19 @@ retry:
hugepage_add_new_anon_rmap(page, vma, address);
}
} else {
+ /*
+ * If memory error occurs between mmap() and fault, some process
+ * don't have hwpoisoned swap entry for errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON;
+ goto backout_unlocked;
+ }
page_dup_rmap(page);
}
/*
- * Since memory error handler replaces pte into hwpoison swap entry
- * at the time of error handling, a process which reserved but not have
- * the mapping to the error hugepage does not have hwpoison swap entry.
- * So we need to block accesses from such a process by checking
- * PG_hwpoison bit here.
- */
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON;
- goto backout_unlocked;
- }
-
- /*
* If we are going to COW a private mapping later, we examine the
* pending reservations ...Reviewed-by: Wu Fengguang <fengguang.wu@intel.com> --
This patch modifies hugepage copy functions to have only destination
and source hugepages as arguments for later use.
The old ones are renamed from copy_{gigantic,huge}_page() to
copy_user_{gigantic,huge}_page().
This naming convention is consistent with that between copy_highpage()
and copy_user_highpage().
ChangeLog since v2:
- change copy_huge_page() from macro to inline dummy function
to avoid compile warning when !CONFIG_HUGETLB_PAGE.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
---
include/linux/hugetlb.h | 4 ++++
mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++++++----
2 files changed, 43 insertions(+), 4 deletions(-)
diff --git v2.6.36-rc2/include/linux/hugetlb.h v2.6.36-rc2/include/linux/hugetlb.h
index 0b73c53..9e51f77 100644
--- v2.6.36-rc2/include/linux/hugetlb.h
+++ v2.6.36-rc2/include/linux/hugetlb.h
@@ -44,6 +44,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
+void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -102,6 +103,9 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
#define hugetlb_change_protection(vma, address, end, newprot)
diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
index 674a25e..283563d 100644
--- v2.6.36-rc2/mm/hugetlb.c
+++ v2.6.36-rc2/mm/hugetlb.c
@@ -423,7 +423,7 @@ static void clear_huge_page(struct page *page,
}
}
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct ...We can't use existing hugepage allocation functions to allocate hugepage
for page migration, because page migration can happen asynchronously with
the running processes and page migration users should call the allocation
function with physical addresses (not virtual addresses) as arguments.
ChangeLog since v2:
- remove unnecessary get/put_mems_allowed() (thanks to David Rientjes)
ChangeLog since v1:
- add comment on top of alloc_huge_page_no_vma()
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 90 ++++++++++++++++++++++++++++++++++++-----------
2 files changed, 72 insertions(+), 21 deletions(-)
diff --git v2.6.36-rc2/include/linux/hugetlb.h v2.6.36-rc2/include/linux/hugetlb.h
index f479700..142bd4f 100644
--- v2.6.36-rc2/include/linux/hugetlb.h
+++ v2.6.36-rc2/include/linux/hugetlb.h
@@ -228,6 +228,8 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
+struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid);
+
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -303,6 +305,7 @@ static inline struct hstate *page_hstate(struct page *page)
#else
struct hstate {};
+#define alloc_huge_page_no_vma_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
index 6871b41..31118d2 100644
--- v2.6.36-rc2/mm/hugetlb.c
+++ v2.6.36-rc2/mm/hugetlb.c
@@ -466,11 +466,22 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, ...One would expect the alloc_buddy_huge_page_node() to only differ with alloc_buddy_huge_page() in the alloc_pages/alloc_pages_exact_node calls. However you implement alloc_buddy_huge_page_node() in a quite different way. Can the two functions be unified at all? Thanks, Fengguang --
Yes. I did it by adding argument @nid to alloc_buddy_huge_page(). Code gets cleaner and work without problems. Thanks, Naoya --
This patch extends soft offlining framework to support hugepage.
When memory corrected errors occur repeatedly on a hugepage,
we can choose to stop using it by migrating data onto another hugepage
and disabling the original (maybe half-broken) one.
ChangeLog since v2:
- move refcount handling into isolate_lru_page()
ChangeLog since v1:
- add double check in isolating hwpoisoned hugepage
- define free/non-free checker for hugepage
- postpone calling put_page() for hugepage in soft_offline_page()
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
include/linux/hugetlb.h | 2 +
mm/hugetlb.c | 26 ++++++++++++++++++++++++
mm/memory-failure.c | 49 ++++++++++++++++++++++++++++++++--------------
3 files changed, 62 insertions(+), 15 deletions(-)
diff --git v2.6.36-rc2/include/linux/hugetlb.h v2.6.36-rc2/include/linux/hugetlb.h
index 9e51f77..c8c1891 100644
--- v2.6.36-rc2/include/linux/hugetlb.h
+++ v2.6.36-rc2/include/linux/hugetlb.h
@@ -44,6 +44,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
+void isolate_hwpoisoned_huge_page(struct page *page);
void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
@@ -103,6 +104,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
+#define isolate_hwpoisoned_huge_page(page) 0
static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
index 57bc1ec..dee7972 100644
--- v2.6.36-rc2/mm/hugetlb.c
+++ v2.6.36-rc2/mm/hugetlb.c
@@ -2987,3 +2987,29 @@ void ...However it should still be racy if the test/isolate actions are not performed in the same hugetlb_lock. Thanks, Fengguang --
Currently unpoisoning hugepages doesn't work because it's not enough
to just clear PG_HWPoison bits and we need to link the hugepage
to be unpoisoned back to the free hugepage list.
To do this, we get and put hwpoisoned hugepage whose refcount is 0.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
mm/memory-failure.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git v2.6.36-rc2/mm/memory-failure.c v2.6.36-rc2/mm/memory-failure.c
index 60178d2..ab36690 100644
--- v2.6.36-rc2/mm/memory-failure.c
+++ v2.6.36-rc2/mm/memory-failure.c
@@ -1154,9 +1154,19 @@ int unpoison_memory(unsigned long pfn)
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
- if (TestClearPageHWPoison(p))
+ /* The page to be unpoisoned was free one when hwpoisoned */
+ if (TestClearPageHWPoison(page))
atomic_long_sub(nr_pages, &mce_bad_pages);
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ if (PageHuge(page)) {
+ /*
+ * To unpoison free hugepage, we get and put it
+ * to move it back to the free list.
+ */
+ get_page(page);
+ clear_page_hwpoison_huge_page(page);
+ put_page(page);
+ }
return 0;
}
@@ -1171,9 +1181,9 @@ int unpoison_memory(unsigned long pfn)
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
- if (PageHuge(p))
- clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
--
1.7.2.1
--
It's racy in free huge page detection.
alloc_huge_page() does not increase page refcount inside hugetlb_lock,
the alloc_huge_page()=>alloc_buddy_huge_page() path even drops the
lock temporarily! Then we never know reliably if a huge page is really
free.
Here is a scratched fix. It is totally untested. Just want to notice
you that with this patch, the huge page unpoisoning should go easier.
Thanks,
Fengguang
---
include/linux/hugetlb.h | 4 +-
mm/hugetlb.c | 51 +++++++++++++++++---------------------
mm/memory-failure.c | 24 ++++++-----------
3 files changed, 34 insertions(+), 45 deletions(-)
--- linux-next.orig/mm/hugetlb.c 2010-08-25 10:16:15.000000000 +0800
+++ linux-next/mm/hugetlb.c 2010-08-25 10:47:17.000000000 +0800
@@ -502,6 +502,7 @@ static struct page *dequeue_huge_page_vm
page = list_entry(h->hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
+ set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -822,12 +823,6 @@ static struct page *alloc_buddy_huge_pag
spin_lock(&hugetlb_lock);
if (page) {
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON(page_count(page));
nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
@@ -877,8 +872,6 @@ retry:
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
- spin_lock(&hugetlb_lock);
- needed = 0;
goto free;
}
@@ -904,33 +897,28 @@ retry:
* process from stealing the pages as they are added to the pool but
* before they are reserved.
*/
- needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
-free:
+
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- if ((--needed) < 0)
- break;
...Great. I adjusted this patch to real hugetlb code and passed libhugetlbfs test. Thanks, Naoya --
debugfs:hwpoison/renew-pfn is the old interface.
This patch renames and fixes it.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
Documentation/vm/page-types.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git v2.6.36-rc2/Documentation/vm/page-types.c v2.6.36-rc2/Documentation/vm/page-types.c
index ccd951f..cc96ee2 100644
--- v2.6.36-rc2/Documentation/vm/page-types.c
+++ v2.6.36-rc2/Documentation/vm/page-types.c
@@ -478,7 +478,7 @@ static void prepare_hwpoison_fd(void)
}
if (opt_unpoison && !hwpoison_forget_fd) {
- sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs);
+ sprintf(buf, "%s/unpoison-pfn", hwpoison_debug_fs);
hwpoison_forget_fd = checked_open(buf, O_WRONLY);
}
}
--
1.7.2.1
--
