Hi,
This is the 2nd version of "hugepage migration" patchset.
There were two points at issue.
* Dividing the hugepage migration functions from the original migration code.
This is to avoid complexity.
In the present version, some high-level migration routines are defined to handle
hugepages, but some low-level routines (such as migrate_copy_page() etc.)
are shared with the original migration code in order not to increase duplication.
* Locking problem between direct I/O and hugepage migration
As a result of digging into the race between hugepage I/O and hugepage migration
(where hugepage I/O can be seen only with direct I/O),
I noticed that we can avoid this race condition without additional locking,
because in direct I/O we can tell from the reference count of the head page
whether some subpages are under I/O, and hugepage migration safely fails
if some references remain. So no data loss should occur on a migration
concurrent with direct I/O.
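To make that reasoning concrete, here is a minimal sketch (not part of the
patchset; the helper name is made up, and the logic mirrors
migrate_huge_page_move_mapping() added in patch 5/9):

/* Any extra reference -- e.g. the per-subpage references taken by in-flight
 * direct I/O via get_user_pages_fast() -- makes the head page's count exceed
 * what migration expects to own, so the migration attempt backs off.
 */
static int hugepage_migration_may_proceed(struct page *hpage,
					  struct address_space *mapping)
{
	int expected_count = mapping ? 2 + page_has_private(hpage) : 1;

	return page_count(hpage) == expected_count;
}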
This patchset is based on the following commit:
commit 1c9bc0d7945bbbcdae99f197535588e5ad24bc1c
"hugetlb: add missing unlock in avoidcopy path in hugetlb_cow()"
on "hwpoison" branch in Andi's tree.
http://git.kernel.org/?p=linux/kernel/git/ak/linux-mce-2.6.git;a=summary
Summary:
[PATCH 1/9] HWPOISON, hugetlb: move PG_HWPoison bit check
[PATCH 2/9] hugetlb: add allocate function for hugepage migration
[PATCH 3/9] hugetlb: rename hugepage allocation functions
[PATCH 4/9] hugetlb: redefine hugepage copy functions
[PATCH 5/9] hugetlb: hugepage migration core
[PATCH 6/9] HWPOISON, hugetlb: soft offlining for hugepage
[PATCH 7/9] HWPOISON, hugetlb: fix unpoison for hugepage
[PATCH 8/9] page-types.c: fix name of unpoison interface
[PATCH 9/9] hugetlb: add corrupted hugepage counter
Documentation/vm/page-types.c | 2 +-
fs/hugetlbfs/inode.c | 15 +++
include/linux/hugetlb.h | 12 ++
include/linux/migrate.h | 12 ++
mm/hugetlb.c | 248 +++++++++++++++++++++++++++++++++--------
mm/memory-failure.c | 88 ++++++++++++---
mm/migrate.c | 196 +++++++++++++++++++++++++++++----
7 files changed, 487 insertions(+), 86 deletions(-)
Thanks,
Naoya Horiguchi
We can't use the existing hugepage allocation functions to allocate a hugepage
for page migration, because page migration can happen asynchronously with respect
to the running processes, and page migration users need to call the allocation
function with physical addresses (not virtual addresses) as arguments.
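For illustration, a migration callback built on top of this function would look
roughly like the sketch below (the callback name is made up here; patch 6/9 adds
the real caller in mm/memory-failure.c):

static struct page *new_hugepage(struct page *hpage, unsigned long private,
				 int **result)
{
	/* No vma or virtual address is available here, so the hstate and
	 * node are taken from the physical page being migrated.
	 */
	return alloc_huge_page_no_vma_node(page_hstate(compound_head(hpage)),
					   page_to_nid(hpage));
}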
ChangeLog:
- add comment on top of alloc_huge_page_no_vma()
Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 90 +++++++++++++++++++++++++++++++++++++----------
2 files changed, 74 insertions(+), 19 deletions(-)
diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
index f479700..142bd4f 100644
--- linux-mce-hwpoison/include/linux/hugetlb.h
+++ linux-mce-hwpoison/include/linux/hugetlb.h
@@ -228,6 +228,8 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
+struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid);
+
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -303,6 +305,7 @@ static inline struct hstate *page_hstate(struct page *page)
#else
struct hstate {};
+#define alloc_huge_page_no_vma_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 5c77a73..2815b83 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -466,11 +466,22 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
@@ -496,19 +507,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
-
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
-
- break;
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
}
}
err:
@@ -616,7 +621,7 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *__alloc_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -627,14 +632,61 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
+ if (page && arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ return NULL;
+ }
+
+ return page;
+}
+
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page = __alloc_huge_page_node(h, nid);
+ if (page)
+ prep_new_huge_page(h, page, nid);
+ return page;
+}
+
+static struct page *alloc_buddy_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page = __alloc_huge_page_node(h, nid);
if (page) {
- if (arch_prepare_hugepage(page)) {
- __free_pages(page, huge_page_order(h));
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page_testzero(page);
+ }
+ return page;
+}
+
+/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ spin_lock(&hugetlb_lock);
+ get_mems_allowed();
+ page = dequeue_huge_page_node(h, nid);
+ put_mems_allowed();
+ spin_unlock(&hugetlb_lock);
+
+ if (!page) {
+ page = alloc_buddy_huge_page_node(h, nid);
+ if (!page) {
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
return NULL;
- }
- prep_new_huge_page(h, page, nid);
+ } else
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
}
+ set_page_refcounted(page);
return page;
}
--
1.7.2.1
This patch extends the page migration code to support hugepage migration.
One of the potential users of this feature is soft offlining, which
is triggered by corrected memory errors (added by the next patch.)
Todo: there are other users of page migration such as memory policy,
memory hotplug and memory compaction.
They are not ready for hugepage support for now.
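As a rough sketch of how a caller is expected to use this (mirroring the
soft-offline path added in the next patch; 'hpage' is the compound head of the
hugepage to migrate and new_page() is the caller's allocation callback):

	LIST_HEAD(pagelist);
	int ret;

	/* Hugepages are not on the LRU, so no isolate_lru_page() here;
	 * migrate_pages() notices PageHuge() and dispatches to
	 * unmap_and_move_huge_page() instead of unmap_and_move().
	 */
	list_add(&hpage->lru, &pagelist);
	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
	if (ret)
		pr_debug("migration of hugepage failed: %d\n", ret);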
ChangeLog since v1:
- divide migration code path for hugepage
- define routine checking migration swap entry for hugetlb
- replace "goto" with "if/else" in remove_migration_pte()
Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
---
fs/hugetlbfs/inode.c | 15 ++++
include/linux/migrate.h | 12 +++
mm/hugetlb.c | 18 ++++-
mm/migrate.c | 196 ++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 220 insertions(+), 21 deletions(-)
diff --git linux-mce-hwpoison/fs/hugetlbfs/inode.c linux-mce-hwpoison/fs/hugetlbfs/inode.c
index a4e9a7e..fee99e8 100644
--- linux-mce-hwpoison/fs/hugetlbfs/inode.c
+++ linux-mce-hwpoison/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
+#include <linux/migrate.h>
#include <asm/uaccess.h>
@@ -589,6 +590,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
return 0;
}
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int rc;
+
+ rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ if (rc)
+ return rc;
+ migrate_page_copy(newpage, page);
+
+ return 0;
+}
+
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -675,6 +689,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.set_page_dirty = hugetlbfs_set_page_dirty,
+ .migratepage = hugetlbfs_migrate_page,
};
diff --git linux-mce-hwpoison/include/linux/migrate.h linux-mce-hwpoison/include/linux/migrate.h
index 7238231..f4c15ff 100644
--- linux-mce-hwpoison/include/linux/migrate.h
+++ linux-mce-hwpoison/include/linux/migrate.h
@@ -23,6 +23,9 @@ extern int migrate_prep_local(void);
extern int migrate_vmas(struct mm_struct *mm,
const nodemask_t *from, const nodemask_t *to,
unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page);
#else
#define PAGE_MIGRATION 0
@@ -40,6 +43,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
return -ENOSYS;
}
+static inline void migrate_page_copy(struct page *newpage,
+ struct page *page) {}
+
+static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ return -ENOSYS;
+}
+
/* Possible settings for the migrate_page() method in address_operations */
#define migrate_page NULL
#define fail_migrate_page NULL
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 2fb8679..0805524 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -2239,6 +2239,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -2678,7 +2691,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON;
}
diff --git linux-mce-hwpoison/mm/migrate.c linux-mce-hwpoison/mm/migrate.c
index 4205b1d..7f9a37c 100644
--- linux-mce-hwpoison/mm/migrate.c
+++ linux-mce-hwpoison/mm/migrate.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
#include <linux/gfp.h>
#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte;
spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
+ if (unlikely(PageHuge(new))) {
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep)
+ goto out;
+ ptl = &mm->page_table_lock;
+ } else {
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ goto out;
- ptep = pte_offset_map(pmd, addr);
+ ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ goto out;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ }
- ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
@@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
+ if (PageHuge(new))
+ pte = pte_mkhuge(pte);
flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
- if (PageAnon(new))
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, addr);
+ else
+ page_dup_rmap(new);
+ } else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
@@ -276,11 +292,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ (struct page *)radix_tree_deref_slot(pslot) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ get_page(newpage);
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count);
+
+ __put_page(page);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
* Copy the page to its new location
*/
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
{
- copy_highpage(newpage, page);
+ if (PageHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
@@ -728,6 +792,86 @@ move_newpage:
}
/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and the writeback status of all subpages is counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+ unsigned long private, struct page *hpage,
+ int force, int offlining)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
+ int rcu_locked = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ if (!new_hpage)
+ return -ENOMEM;
+
+ rc = -EAGAIN;
+
+ if (!trylock_page(hpage)) {
+ if (!force)
+ goto out;
+ lock_page(hpage);
+ }
+
+ if (PageAnon(hpage)) {
+ rcu_read_lock();
+ rcu_locked = 1;
+
+ if (page_mapped(hpage)) {
+ anon_vma = page_anon_vma(hpage);
+ atomic_inc(&anon_vma->external_refcount);
+ }
+ }
+
+ try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, 1);
+
+ if (rc)
+ remove_migration_ptes(hpage, hpage);
+
+ if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+ &anon_vma->lock)) {
+ int empty = list_empty(&anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+ if (empty)
+ anon_vma_free(anon_vma);
+ }
+
+ if (rcu_locked)
+ rcu_read_unlock();
+out:
+ unlock_page(hpage);
+
+ if (rc != -EAGAIN)
+ put_page(hpage);
+
+ put_page(new_hpage);
+
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(new_hpage);
+ }
+ return rc;
+}
+
+/*
* migrate_pages
*
* The function takes one list of pages to migrate and a function
@@ -751,6 +895,7 @@ int migrate_pages(struct list_head *from,
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
+ int putback_lru = 1;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -761,7 +906,17 @@ int migrate_pages(struct list_head *from,
list_for_each_entry_safe(page, page2, from, lru) {
cond_resched();
- rc = unmap_and_move(get_new_page, private,
+ /*
+ * Hugepage should be handled differently from
+ * non-hugepage because it's not linked to LRU list
+ * and reference counting policy is different.
+ */
+ if (PageHuge(page)) {
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, page, pass > 2, offlining);
+ putback_lru = 0;
+ } else
+ rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining);
switch(rc) {
@@ -784,7 +939,8 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- putback_lru_pages(from);
+ if (putback_lru)
+ putback_lru_pages(from);
if (rc)
return rc;
--
1.7.2.1
In order to handle metadata correctly, we should check whether the hugepage
we are going to access is HWPOISONed *before* incrementing the mapcount,
adding the hugepage into the page cache, or constructing an anon_vma.
This patch also adds retry code for the case where there is a race between
alloc_huge_page() and memory failure.
Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
---
mm/hugetlb.c | 34 +++++++++++++++++++++-------------
1 files changed, 21 insertions(+), 13 deletions(-)
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index a26c24a..5c77a73 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -2490,8 +2490,15 @@ retry:
int err;
struct inode *inode = mapping->host;
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ lock_page(page);
+ if (unlikely(PageHWPoison(page))) {
+ unlock_page(page);
+ goto retry;
+ }
+ err = add_to_page_cache_locked(page, mapping,
+ idx, GFP_KERNEL);
if (err) {
+ unlock_page(page);
put_page(page);
if (err == -EEXIST)
goto retry;
@@ -2504,6 +2511,10 @@ retry:
page_dup_rmap(page);
} else {
lock_page(page);
+ if (unlikely(PageHWPoison(page))) {
+ unlock_page(page);
+ goto retry;
+ }
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
@@ -2511,22 +2522,19 @@ retry:
hugepage_add_new_anon_rmap(page, vma, address);
}
} else {
+ /*
+ * If a memory error occurs between mmap() and fault, some processes
+ * don't have a hwpoisoned swap entry for the errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON;
+ goto backout_unlocked;
+ }
page_dup_rmap(page);
}
/*
- * Since memory error handler replaces pte into hwpoison swap entry
- * at the time of error handling, a process which reserved but not have
- * the mapping to the error hugepage does not have hwpoison swap entry.
- * So we need to block accesses from such a process by checking
- * PG_hwpoison bit here.
- */
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON;
- goto backout_unlocked;
- }
-
- /*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
--
1.7.2.1
This patch extends the soft offlining framework to support hugepages.
When corrected memory errors occur repeatedly on a hugepage,
we can choose to stop using it by migrating its data onto another hugepage
and disabling the original (maybe half-broken) one.
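For reference, one way to exercise this path from user space is the
madvise-based injector; the snippet below is only an assumed usage sketch
(it presumes a kernel with memory-failure support, CAP_SYS_ADMIN, and 'addr'
pointing into a hugepage-backed mapping), not part of this patch:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

/* Ask the kernel to migrate the data to a new hugepage and retire the
 * (possibly half-broken) hugepage backing 'addr'.
 */
static int soft_offline_hugepage_at(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_SOFT_OFFLINE) != 0) {
		perror("madvise(MADV_SOFT_OFFLINE)");
		return -1;
	}
	return 0;
}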
ChangeLog since v1:
- add double check in isolating hwpoisoned hugepage
- define free/non-free checker for hugepage
- postpone calling put_page() for hugepage in soft_offline_page()
Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
---
include/linux/hugetlb.h | 2 +
mm/hugetlb.c | 26 +++++++++++++++++
mm/memory-failure.c | 70 ++++++++++++++++++++++++++++++++++++-----------
3 files changed, 82 insertions(+), 16 deletions(-)
diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
index f77d2ba..2b7de04 100644
--- linux-mce-hwpoison/include/linux/hugetlb.h
+++ linux-mce-hwpoison/include/linux/hugetlb.h
@@ -44,6 +44,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
+void isolate_hwpoisoned_huge_page(struct page *page);
void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
@@ -103,6 +104,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
+#define isolate_hwpoisoned_huge_page(page) 0
#define copy_huge_page(dst, src) NULL
#define hugetlb_change_protection(vma, address, end, newprot)
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 0805524..2a61a8f 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -2995,3 +2995,29 @@ void __isolate_hwpoisoned_huge_page(struct page *hpage)
h->free_huge_pages_node[nid]--;
spin_unlock(&hugetlb_lock);
}
+
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) {
+ if (page == hpage) {
+ spin_unlock(&hugetlb_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&hugetlb_lock);
+ return 0;
+}
+
+void isolate_hwpoisoned_huge_page(struct page *hpage)
+{
+ lock_page(hpage);
+ if (is_hugepage_on_freelist(hpage))
+ __isolate_hwpoisoned_huge_page(hpage);
+ unlock_page(hpage);
+}
diff --git linux-mce-hwpoison/mm/memory-failure.c linux-mce-hwpoison/mm/memory-failure.c
index d0b420a..0bfe5b3 100644
--- linux-mce-hwpoison/mm/memory-failure.c
+++ linux-mce-hwpoison/mm/memory-failure.c
@@ -1186,7 +1186,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1214,8 +1218,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
+ if (PageHuge(p)) {
+ pr_debug("get_any_page: %#lx free huge page\n", pfn);
+ ret = 0;
+ SetPageHWPoison(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
pr_debug("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
@@ -1260,6 +1272,7 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1270,7 +1283,7 @@ int soft_offline_page(struct page *page, int flags)
/*
* Page cache page we can handle?
*/
- if (!PageLRU(page)) {
+ if (!PageLRU(page) && !PageHuge(page)) {
/*
* Try to free it.
*/
@@ -1286,21 +1299,21 @@ int soft_offline_page(struct page *page, int flags)
if (ret == 0)
goto done;
}
- if (!PageLRU(page)) {
+ if (!PageLRU(page) && !PageHuge(page)) {
pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
- lock_page(page);
- wait_on_page_writeback(page);
+ lock_page(hpage);
+ wait_on_page_writeback(hpage);
/*
* Synchronized using the page lock with memory_failure()
*/
- if (PageHWPoison(page)) {
- unlock_page(page);
- put_page(page);
+ if (PageHWPoison(hpage)) {
+ unlock_page(hpage);
+ put_page(hpage);
pr_debug("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1310,7 +1323,7 @@ int soft_offline_page(struct page *page, int flags)
* non dirty unmapped page cache pages.
*/
ret = invalidate_inode_page(page);
- unlock_page(page);
+ unlock_page(hpage);
/*
* Drop count because page migration doesn't like raised
@@ -1318,8 +1331,13 @@ int soft_offline_page(struct page *page, int flags)
* LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
+ *
+ * Postpone dropping count for hugepage until migration completes,
+ * because otherwise old hugepage will be freed before copying.
*/
- put_page(page);
+ if (!PageHuge(hpage))
+ put_page(hpage);
+
if (ret == 1) {
ret = 0;
pr_debug("soft_offline: %#lx: invalidated\n", pfn);
@@ -1330,19 +1348,33 @@ int soft_offline_page(struct page *page, int flags)
* Simple invalidation didn't work.
* Try to migrate to a new page instead. migrate.c
* handles a large number of cases for us.
+ *
+ * Hugepage has no link to LRU list, so just skip this.
*/
- ret = isolate_lru_page(page);
+ if (PageHuge(page))
+ ret = 0;
+ else
+ ret = isolate_lru_page(page);
+
if (!ret) {
LIST_HEAD(pagelist);
- list_add(&page->lru, &pagelist);
+ list_add(&hpage->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
if (ret) {
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
+ /*
+ * When hugepage migration succeeded, the old hugepage
+ * should already be freed, so we put it only
+ * in the failure path.
+ */
+ if (PageHuge(hpage))
+ put_page(hpage);
}
+
} else {
pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
@@ -1351,8 +1383,14 @@ int soft_offline_page(struct page *page, int flags)
return ret;
done:
- atomic_long_add(1, &mce_bad_pages);
- SetPageHWPoison(page);
- /* keep elevated page count for bad page */
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ if (PageHuge(hpage)) {
+ set_page_hwpoison_huge_page(hpage);
+ isolate_hwpoisoned_huge_page(hpage);
+ } else {
+ SetPageHWPoison(page);
+ /* keep elevated page count for bad page */
+ }
return ret;
}
--
1.7.2.1
This patch introduces hugepage copy functions that take only the destination
and source hugepages as arguments, for later use.
The old ones are renamed from copy_{gigantic,huge}_page() to
copy_user_{gigantic,huge}_page().
This naming convention is consistent with the one used for copy_highpage()
and copy_user_highpage().
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/hugetlb.h | 2 ++
mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++++++----
2 files changed, 41 insertions(+), 4 deletions(-)
diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
index 0b73c53..f77d2ba 100644
--- linux-mce-hwpoison/include/linux/hugetlb.h
+++ linux-mce-hwpoison/include/linux/hugetlb.h
@@ -44,6 +44,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
+void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -102,6 +103,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
+#define copy_huge_page(dst, src) NULL
#define hugetlb_change_protection(vma, address, end, newprot)
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 79be5f3..2fb8679 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -423,7 +423,7 @@ static void clear_huge_page(struct page *page,
}
}
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
src = mem_map_next(src, src_base, i);
}
}
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_gigantic_page(dst, src, addr, vma);
+ copy_user_gigantic_page(dst, src, addr, vma);
return;
}
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
}
}
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
@@ -2437,7 +2472,7 @@ retry_avoidcopy:
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- copy_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
/*
--
1.7.2.1
The function name alloc_huge_page_no_vma_node() has the verbose suffix "_no_vma".
This patch gives the existing alloc_huge_page() and its family a "_vma" suffix
instead, which makes the names easier to read.
---
include/linux/hugetlb.h | 4 ++--
mm/hugetlb.c | 20 ++++++++++----------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
index 142bd4f..0b73c53 100644
--- linux-mce-hwpoison/include/linux/hugetlb.h
+++ linux-mce-hwpoison/include/linux/hugetlb.h
@@ -228,7 +228,7 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
-struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid);
+struct page *alloc_huge_page_node(struct hstate *h, int nid);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -305,7 +305,7 @@ static inline struct hstate *page_hstate(struct page *page)
#else
struct hstate {};
-#define alloc_huge_page_no_vma_node(h, nid) NULL
+#define alloc_huge_page_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 2815b83..79be5f3 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -667,7 +667,7 @@ static struct page *alloc_buddy_huge_page_node(struct hstate *h, int nid)
* E.g. soft-offlining uses this function because it only cares physical
* address of error page.
*/
-struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -821,7 +821,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
+static struct page *alloc_buddy_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct page *page;
@@ -922,7 +922,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
+ page = alloc_buddy_huge_page_vma(h, NULL, 0);
if (!page) {
/*
* We were not able to allocate enough pages to
@@ -1075,8 +1075,8 @@ static void vma_commit_reservation(struct hstate *h,
}
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+static struct page *alloc_huge_page_vma(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
{
struct hstate *h = hstate_vma(vma);
struct page *page;
@@ -1103,7 +1103,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page_vma(h, vma, addr);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
@@ -1322,7 +1322,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with alloc_buddy_huge_page() here and be unable
+ * We might race with alloc_buddy_huge_page_vma() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -1361,7 +1361,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * alloc_buddy_huge_page() is checking the global counter,
+ * alloc_buddy_huge_page_vma() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -2402,7 +2402,7 @@ retry_avoidcopy:
/* Drop page_table_lock as buddy allocator may be called */
spin_unlock(&mm->page_table_lock);
- new_page = alloc_huge_page(vma, address, outside_reserve);
+ new_page = alloc_huge_page_vma(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
page_cache_release(old_page);
@@ -2530,7 +2530,7 @@ retry:
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
- page = alloc_huge_page(vma, address, 0);
+ page = alloc_huge_page_vma(vma, address, 0);
if (IS_ERR(page)) {
ret = -PTR_ERR(page);
goto out;
--
1.7.2.1
Currently, unpoisoning hugepages doesn't work, because it's not enough
to just clear the PG_HWPoison bits; we also need to link the hugepage
to be unpoisoned back to the free hugepage list.
To do this, we get and put the hwpoisoned hugepage whose refcount is 0.
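The reason a bare get/put pair is enough is that free_huge_page() is installed
as the compound page destructor, so the final put_page() re-enqueues the page
on its hstate free list; spelled out as a sketch (the hunk below is the real
change):

	if (PageHuge(page) && !page_count(page)) {
		/*
		 * 0 -> 1 -> 0 reference cycle: the final put_page() invokes
		 * the compound destructor free_huge_page(), which returns
		 * the now-clean page to the hstate free list.
		 */
		get_page(page);
		clear_page_hwpoison_huge_page(page);
		put_page(page);
	}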
Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
---
mm/memory-failure.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git linux-mce-hwpoison/mm/memory-failure.c linux-mce-hwpoison/mm/memory-failure.c
index 0bfe5b3..1f54901 100644
--- linux-mce-hwpoison/mm/memory-failure.c
+++ linux-mce-hwpoison/mm/memory-failure.c
@@ -1153,9 +1153,19 @@ int unpoison_memory(unsigned long pfn)
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
- if (TestClearPageHWPoison(p))
+ /* The page to be unpoisoned was free one when hwpoisoned */
+ if (TestClearPageHWPoison(page))
atomic_long_sub(nr_pages, &mce_bad_pages);
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ if (PageHuge(page)) {
+ /*
+ * To unpoison free hugepage, we get and put it
+ * to move it back to the free list.
+ */
+ get_page(page);
+ clear_page_hwpoison_huge_page(page);
+ put_page(page);
+ }
return 0;
}
@@ -1170,9 +1180,9 @@ int unpoison_memory(unsigned long pfn)
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
- if (PageHuge(p))
- clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
--
1.7.2.1
debugfs:hwpoison/renew-pfn is the old name of the unpoison interface.
This patch fixes page-types.c to open the current name, hwpoison/unpoison-pfn.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
Documentation/vm/page-types.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git linux-mce-hwpoison/Documentation/vm/page-types.c linux-mce-hwpoison/Documentation/vm/page-types.c
index 66e9358..f120ed0 100644
--- linux-mce-hwpoison/Documentation/vm/page-types.c
+++ linux-mce-hwpoison/Documentation/vm/page-types.c
@@ -478,7 +478,7 @@ static void prepare_hwpoison_fd(void)
}
if (opt_unpoison && !hwpoison_forget_fd) {
- sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs);
+ sprintf(buf, "%s/unpoison-pfn", hwpoison_debug_fs);
hwpoison_forget_fd = checked_open(buf, O_WRONLY);
}
}
--
1.7.2.1
This patch adds "HugePages_Crpt:" line in /proc/meminfo like below:
# cat /proc/meminfo |grep -e Huge -e Corrupt
HardwareCorrupted: 6144 kB
HugePages_Total: 8
HugePages_Free: 5
HugePages_Rsvd: 0
HugePages_Surp: 0
HugePages_Crpt: 3
Hugepagesize: 2048 kB
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/hugetlb.h | 5 +++++
mm/hugetlb.c | 19 +++++++++++++++++++
mm/memory-failure.c | 2 ++
3 files changed, 26 insertions(+), 0 deletions(-)
diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
index 2b7de04..c7b4dae 100644
--- linux-mce-hwpoison/include/linux/hugetlb.h
+++ linux-mce-hwpoison/include/linux/hugetlb.h
@@ -45,6 +45,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
void isolate_hwpoisoned_huge_page(struct page *page);
+void increment_corrupted_huge_page(struct page *page);
+void decrement_corrupted_huge_page(struct page *page);
void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
@@ -105,6 +107,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
#define isolate_hwpoisoned_huge_page(page) 0
+#define increment_corrupted_huge_page(page) 0
+#define decrement_corrupted_huge_page(page) 0
#define copy_huge_page(dst, src) NULL
#define hugetlb_change_protection(vma, address, end, newprot)
@@ -220,6 +224,7 @@ struct hstate {
unsigned long resv_huge_pages;
unsigned long surplus_huge_pages;
unsigned long nr_overcommit_huge_pages;
+ unsigned long corrupted_huge_pages;
struct list_head hugepage_freelists[MAX_NUMNODES];
unsigned int nr_huge_pages_node[MAX_NUMNODES];
unsigned int free_huge_pages_node[MAX_NUMNODES];
diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
index 2a61a8f..122790b 100644
--- linux-mce-hwpoison/mm/hugetlb.c
+++ linux-mce-hwpoison/mm/hugetlb.c
@@ -2040,11 +2040,13 @@ void hugetlb_report_meminfo(struct seq_file *m)
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
+ "HugePages_Crpt: %5lu\n"
"Hugepagesize: %8lu kB\n",
h->nr_huge_pages,
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
+ h->corrupted_huge_pages,
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
@@ -2980,6 +2982,23 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+void increment_corrupted_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ spin_lock(&hugetlb_lock);
+ h->corrupted_huge_pages++;
+ spin_unlock(&hugetlb_lock);
+}
+
+void decrement_corrupted_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ spin_lock(&hugetlb_lock);
+ BUG_ON(!h->corrupted_huge_pages);
+ h->corrupted_huge_pages--;
+ spin_unlock(&hugetlb_lock);
+}
+
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
diff --git linux-mce-hwpoison/mm/memory-failure.c linux-mce-hwpoison/mm/memory-failure.c
index 1f54901..1e9794d 100644
--- linux-mce-hwpoison/mm/memory-failure.c
+++ linux-mce-hwpoison/mm/memory-failure.c
@@ -938,6 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
int nr_pages = 1 << compound_order(hpage);
for (i = 0; i < nr_pages; i++)
SetPageHWPoison(hpage + i);
+ increment_corrupted_huge_page(hpage);
}
static void clear_page_hwpoison_huge_page(struct page *hpage)
@@ -946,6 +947,7 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
int nr_pages = 1 << compound_order(hpage);
for (i = 0; i < nr_pages; i++)
ClearPageHWPoison(hpage + i);
+ decrement_corrupted_huge_page(hpage);
}
int __memory_failure(unsigned long pfn, int trapno, int flags)
--
1.7.2.1
On Tue, 10 Aug 2010, Naoya Horiguchi wrote:
> There were two points at issue.
>
> * Dividing the hugepage migration functions from the original migration code.
> This is to avoid complexity.
> In the present version, some high-level migration routines are defined to handle
> hugepages, but some low-level routines (such as migrate_copy_page() etc.)
> are shared with the original migration code in order not to increase duplication.
I hoped that we can avoid the branching for taking stuff off the lru and
put pages back later to the lru. Seems that we still do that. Can we
refactor the code in such a way that the lru handling cleanly isolates?
There are now multiple use cases for migration that could avoid LRU
handling even for PAGE_SIZE pages.
> * Locking problem between direct I/O and hugepage migration
> As a result of digging into the race between hugepage I/O and hugepage migration
> (where hugepage I/O can be seen only with direct I/O),
> I noticed that we can avoid this race condition without additional locking,
> because in direct I/O we can tell from the reference count of the head page
> whether some subpages are under I/O, and hugepage migration safely fails
> if some references remain. So no data loss should occur on a migration
> concurrent with direct I/O.
Can you also avoid refcounts being increased during migration? The page
lock is taken for the PAGE_SIZEd migration case. Can direct I/O be stopped
by taking the page lock on the head page? If not then races can still
occur.
On Wed, Aug 11, 2010 at 08:09:59AM -0500, Christoph Lameter wrote:
> On Tue, 10 Aug 2010, Naoya Horiguchi wrote:
>
> > There were two points at issue.
> >
> > * Dividing the hugepage migration functions from the original migration code.
> > This is to avoid complexity.
> > In the present version, some high-level migration routines are defined to handle
> > hugepages, but some low-level routines (such as migrate_copy_page() etc.)
> > are shared with the original migration code in order not to increase duplication.
>
> I hoped that we can avoid the branching for taking stuff off the lru and
> put pages back later to the lru. Seems that we still do that. Can we
> refactor the code in such a way that the lru handling cleanly isolates?
> There are now multiple use cases for migration that could avoid LRU
> handling even for PAGE_SIZE pages.
OK.
I'll rewrite isolate_lru_page() and putback_lru_page() like this:
isolate_lru_page()
for PAGE_SIZE page : delete from LRU list and get count
for hugepage : just get count
putback_lru_page()
for PAGE_SIZE page : add to LRU list and put count
for hugepage : just put count
By doing this, we can avoid ugly code in individual migration callers.
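A rough sketch of what that could look like (hypothetical helper names, just
to make the idea concrete):

/* Hypothetical shape of the proposed refactoring: callers use one pair of
 * helpers and the hugepage special case is hidden inside them.
 */
static int migrate_isolate_page(struct page *page)
{
	if (PageHuge(page)) {
		get_page(page);			/* not on the LRU: just pin it */
		return 0;
	}
	return isolate_lru_page(page);		/* unlink from LRU + get count */
}

static void migrate_putback_page(struct page *page)
{
	if (PageHuge(page)) {
		put_page(page);			/* just drop the pin */
		return;
	}
	putback_lru_page(page);			/* relink to LRU + put count */
}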
> > * Locking problem between direct I/O and hugepage migration
> > As a result of digging into the race between hugepage I/O and hugepage migration
> > (where hugepage I/O can be seen only with direct I/O),
> > I noticed that we can avoid this race condition without additional locking,
> > because in direct I/O we can tell from the reference count of the head page
> > whether some subpages are under I/O, and hugepage migration safely fails
> > if some references remain. So no data loss should occur on a migration
> > concurrent with direct I/O.
>
> Can you also avoid refcounts being increased during migration?
Yes. I think this will be done in above-mentioned refactoring.
> The page
> lock is taken for the PAGE_SIZEd migration case. Can direct I/O be stopped
> by taking the page lock on the head page? If not then races can still
> occur.
Ah. I missed it.
This patch only handles migration under direct I/O.
For the opposite (direct I/O under migration) it's not true.
I wrote additional patches (later I'll reply to this email)
for solving locking problem. Could you review them?
(Maybe these patches are beyond the scope of hugepage migration patch,
so is it better to propose them separately?)
Thanks,
Naoya Horiguchi
Basically it is user's responsibility to take care of race condition
related to direct I/O, but some events which are out of user's control
(such as memory failure) can happen at any time. So we need to lock and
set/clear PG_writeback flags in direct I/O code to protect from data loss.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
fs/direct-io.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aac..0d0810d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -439,7 +439,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
struct page *page = bvec[page_no].bv_page;
if (dio->rw == READ && !PageCompound(page))
- set_page_dirty_lock(page);
+ set_page_dirty(page);
+ if (dio->rw & WRITE)
+ end_page_writeback(page);
+ unlock_page(page);
page_cache_release(page);
}
bio_put(bio);
@@ -702,11 +705,14 @@ submit_page_section(struct dio *dio, struct page *page,
{
int ret = 0;
+ lock_page(page);
+
if (dio->rw & WRITE) {
/*
* Read accounting is performed in submit_bio()
*/
task_io_account_write(len);
+ set_page_writeback(page);
}
/*
--
1.7.2.1
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/memory-failure.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e387098..2eb740e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1052,7 +1052,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(hpage);
+ lock_page_against_memory_failure(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1065,7 +1065,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages);
- unlock_page(hpage);
+ unlock_page_against_memory_failure(hpage);
put_page(hpage);
return 0;
}
@@ -1077,7 +1077,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, "hugepage already hardware poisoned",
IGNORED);
- unlock_page(hpage);
+ unlock_page_against_memory_failure(hpage);
put_page(hpage);
return 0;
}
@@ -1090,7 +1090,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p))
set_page_hwpoison_huge_page(hpage);
- wait_on_page_writeback(p);
+ wait_on_pages_writeback_against_memory_failure(hpage);
/*
* Now take care of user space mappings.
@@ -1119,7 +1119,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
}
out:
- unlock_page(hpage);
+ unlock_page_against_memory_failure(hpage);
return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1195,7 +1195,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- lock_page_nosync(page);
+ lock_page_against_memory_failure(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
@@ -1209,7 +1209,7 @@ int unpoison_memory(unsigned long pfn)
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
}
- unlock_page(page);
+ unlock_page_against_memory_failure(page);
put_page(page);
if (freeit)
@@ -1341,14 +1341,14 @@ int soft_offline_page(struct page *page, int flags)
return -EIO;
}
- lock_page(hpage);
- wait_on_page_writeback(hpage);
+ lock_page_against_memory_failure(hpage);
+ wait_on_pages_writeback_against_memory_failure(hpage);
/*
* Synchronized using the page lock with memory_failure()
*/
if (PageHWPoison(hpage)) {
- unlock_page(hpage);
+ unlock_page_against_memory_failure(hpage);
put_page(hpage);
pr_debug("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
--
1.7.2.1
For the migration of PAGE_SIZE pages, we can choose to continue the
migration with the "force" switch even if the old page has its page lock held.
But for a hugepage, I/O on the subpages is not necessarily completed
in ascending order, which can cause a race.
So we make the migration fail in that case for safety.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/migrate.c | 15 +++++++++------
1 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 7f9a37c..43347e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -820,11 +820,14 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;
- if (!trylock_page(hpage)) {
- if (!force)
- goto out;
- lock_page(hpage);
- }
+ /*
+ * If some subpages are locked, it can cause race condition.
+ * So then we return from migration and try again.
+ */
+ if (!trylock_huge_page(hpage))
+ goto out;
+
+ wait_on_huge_page_writeback(hpage);
if (PageAnon(hpage)) {
rcu_read_lock();
@@ -855,7 +858,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (rcu_locked)
rcu_read_unlock();
out:
- unlock_page(hpage);
+ unlock_huge_page(hpage);
if (rc != -EAGAIN)
put_page(hpage);
--
1.7.2.1
This patch defines some helper functions to avoid race conditions
on hugepage I/O. We assume that locking/unlocking of subpages is
done in ascending/descending address order to avoid deadlock.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
include/linux/hugetlb.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++
mm/memory-failure.c | 24 ++++++++++++++++++++
2 files changed, 79 insertions(+), 0 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c7b4dae..dabed89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -312,6 +312,55 @@ static inline struct hstate *page_hstate(struct page *page)
return size_to_hstate(PAGE_SIZE << compound_order(page));
}
+/*
+ * Locking functions for hugepage.
+ * We assume that locking/unlocking of subpages is done
+ * in ascending/descending address order to avoid deadlock.
+ */
+
+/* If no subpage is locked, return 1. Otherwise, return 0. */
+static inline int trylock_huge_page(struct page *page)
+{
+ int i;
+ int ret = 1;
+ int nr_pages = pages_per_huge_page(page_hstate(page));
+ for (i = 0; i < nr_pages; i++)
+ ret &= trylock_page(page + i);
+ return ret;
+}
+
+static inline void lock_huge_page(struct page *page)
+{
+ int i;
+ int nr_pages = pages_per_huge_page(page_hstate(page));
+ for (i = 0; i < nr_pages; i++)
+ lock_page(page + i);
+}
+
+static inline void lock_huge_page_nosync(struct page *page)
+{
+ int i;
+ int nr_pages = pages_per_huge_page(page_hstate(page));
+ for (i = 0; i < nr_pages; i++)
+ lock_page_nosync(page + i);
+}
+
+static inline void unlock_huge_page(struct page *page)
+{
+ int i;
+ int nr_pages = pages_per_huge_page(page_hstate(page));
+ for (i = nr_pages - 1; i >= 0; i--)
+ unlock_page(page + i);
+}
+
+static inline void wait_on_huge_page_writeback(struct page *page)
+{
+ int i;
+ int nr_pages = pages_per_huge_page(page_hstate(page));
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(page + i);
+}
+
#else
struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
@@ -329,6 +378,12 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
{
return 1;
}
+#define page_hstate(p) NULL
+#define trylock_huge_page(p) NULL
+#define lock_huge_page(p) NULL
+#define lock_huge_page_nosync(p) NULL
+#define unlock_huge_page(p) NULL
+#define wait_on_huge_page_writeback(p) NULL
#endif
#endif /* _LINUX_HUGETLB_H */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1e9794d..e387098 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -950,6 +950,30 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
decrement_corrupted_huge_page(hpage);
}
+static void lock_page_against_memory_failure(struct page *p)
+{
+ if (PageHuge(p))
+ lock_huge_page_nosync(p);
+ else
+ lock_page_nosync(p);
+}
+
+static void unlock_page_against_memory_failure(struct page *p)
+{
+ if (PageHuge(p))
+ unlock_huge_page(p);
+ else
+ unlock_page(p);
+}
+
+static void wait_on_pages_writeback_against_memory_failure(struct page *p)
+{
+ if (PageHuge(p))
+ wait_on_huge_page_writeback(p);
+ else
+ wait_on_page_writeback(p);
+}
+
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
--
1.7.2.1
Naoya Horiguchi <[email protected]> writes:
> Basically it is user's responsibility to take care of race condition
> related to direct I/O, but some events which are out of user's control
> (such as memory failure) can happen at any time. So we need to lock and
> set/clear PG_writeback flags in direct I/O code to protect from data loss.
Did you do any performance testing of this? If not, please do and
report back. I'm betting users won't be pleased with the results.
Cheers,
Jeff
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
> fs/direct-io.c | 8 +++++++-
> 1 files changed, 7 insertions(+), 1 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 7600aac..0d0810d 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -439,7 +439,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
> struct page *page = bvec[page_no].bv_page;
>
> if (dio->rw == READ && !PageCompound(page))
> - set_page_dirty_lock(page);
> + set_page_dirty(page);
> + if (dio->rw & WRITE)
> + end_page_writeback(page);
> + unlock_page(page);
> page_cache_release(page);
> }
> bio_put(bio);
> @@ -702,11 +705,14 @@ submit_page_section(struct dio *dio, struct page *page,
> {
> int ret = 0;
>
> + lock_page(page);
> +
> if (dio->rw & WRITE) {
> /*
> * Read accounting is performed in submit_bio()
> */
> task_io_account_write(len);
> + set_page_writeback(page);
> }
>
> /*
On Thu, 12 Aug 2010, Naoya Horiguchi wrote:
> > Can you also avoid refcounts being increased during migration?
>
> Yes. I think this will be done in above-mentioned refactoring.
That's not what I meant. Can you avoid other processors increasing
refcounts (direct I/O etc?) on any page struct of the huge page while
migration is running?
> This patch only handles migration under direct I/O.
> For the opposite (direct I/O under migration) it's not true.
> I wrote additional patches (later I'll reply to this email)
> for solving locking problem. Could you review them?
Sure.
> (Maybe these patches are beyond the scope of hugepage migration patch,
> so is it better to propose them separately?)
Migration with known races is really not what we want in the kernel.
Hi,
On Thu, Aug 12, 2010 at 09:42:21AM -0400, Jeff Moyer wrote:
> Naoya Horiguchi <[email protected]> writes:
>
> > Basically it is user's responsibility to take care of race condition
> > related to direct I/O, but some events which are out of user's control
> > (such as memory failure) can happen at any time. So we need to lock and
> > set/clear PG_writeback flags in direct I/O code to protect from data loss.
>
> Did you do any performance testing of this? If not, please do and
> report back. I'm betting users won't be pleased with the results.
Here is the result of my direct I/O benchmark, which measures the time
it takes to do direct I/O for 20000 pages on 2MB buffer for four types
of I/O. Each I/O is issued for one page unit and each number below is
the average of 25 runs.
                            with patchset        2.6.35-rc3
Buffer     I/O type        average(s) STD(s)  average(s) STD(s)  diff(s)
hugepage   Sequential Read      3.87    0.16       3.88    0.20    -0.01
           Sequential Write     7.69    0.43       7.69    0.43     0.00
           Random Read          5.93    1.58       6.49    1.45    -0.55
           Random Write        13.50    0.28      13.41    0.30     0.09
anonymous  Sequential Read      3.88    0.21       3.89    0.23    -0.01
           Sequential Write     7.86    0.39       7.80    0.34     0.05
           Random Read          7.67    1.60       6.86    1.27     0.80
           Random Write        13.50    0.25      13.52    0.31    -0.01
From this result, although the fluctuation is relatively large for random read,
the differences between the vanilla kernel and the patched one are within the
deviations, and it seems that adding the direct I/O locking has little or no
impact on performance.
And I know the workload of this benchmark can be too simple,
so please let me know if you think we have another workload to be looked into.
Thanks,
Naoya Horiguchi
> And I know the workload of this benchmark can be too simple,
> so please let me know if you think we have another workload to be looked into.
I would try it with some of the fio workfiles that use O_DIRECT,
especially the parallel ones. Perhaps people can share their favourite one.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, Aug 13, 2010 at 07:47:21AM -0500, Christoph Lameter wrote:
> On Thu, 12 Aug 2010, Naoya Horiguchi wrote:
>
> > > Can you also avoid refcounts being increased during migration?
> >
> > Yes. I think this will be done in above-mentioned refactoring.
>
> Thats not what I meant. Can you avoid other processors increasing
> refcounts (direct I/O etc?) on any page struct of the huge page while
> migration is running?
In my understanding, in current code "other processors increasing refcount
during migration" can happen both in non-hugepage direct I/O and in hugepage
direct I/O in the similar way (i.e. get_user_pages_fast() from dio_refill_pages()).
So I think there is no specific problem to hugepage.
Or am I missing your point?
>
> > This patch only handles migration under direct I/O.
> > For the opposite (direct I/O under migration) it's not true.
> > I wrote additional patches (later I'll reply to this email)
> > for solving locking problem. Could you review them?
>
> Sure.
>
> > (Maybe these patches are beyond the scope of hugepage migration patch,
> > so is it better to propose them separately?)
>
> Migration with known races is really not what we want in the kernel.
Yes.
Thanks,
Naoya Horiguchi
On Mon, 16 Aug 2010, Naoya Horiguchi wrote:
> In my understanding, in current code "other processors increasing refcount
> during migration" can happen both in non-hugepage direct I/O and in hugepage
> direct I/O in the similar way (i.e. get_user_pages_fast() from dio_refill_pages()).
> So I think there is no specific problem to hugepage.
> Or am I missing your point?
With a single page there is the check of the refcount during migration
after all the references have been removed (at that point the page is no
longer mapped by any process and direct I/O can no longer be
initiated without a page fault).
I see that you are running try_to_unmap() from unmap_and_move_huge_page().
I don't see a patch adding huge page support to try_to_unmap() though. How
does this work?
Naoya Horiguchi <[email protected]> writes:
> Hi,
>
> On Thu, Aug 12, 2010 at 09:42:21AM -0400, Jeff Moyer wrote:
>> Naoya Horiguchi <[email protected]> writes:
>>
>> > Basically it is user's responsibility to take care of race condition
>> > related to direct I/O, but some events which are out of user's control
>> > (such as memory failure) can happen at any time. So we need to lock and
>> > set/clear PG_writeback flags in dierct I/O code to protect from data loss.
>>
>> Did you do any performance testing of this? If not, please do and
>> report back. I'm betting users won't be pleased with the results.
>
> Here is the result of my direct I/O benchmarck, which mesures the time
> it takes to do direct I/O for 20000 pages on 2MB buffer for four types
> of I/O. Each I/O is issued for one page unit and each number below is
> the average of 25 runs.
>
> with patchset 2.6.35-rc3
> Buffer I/O type average(s) STD(s) average(s) STD(s) diff(s)
> hugepage Sequential Read 3.87 0.16 3.88 0.20 -0.01
> Sequential Write 7.69 0.43 7.69 0.43 0.00
> Random Read 5.93 1.58 6.49 1.45 -0.55
> Random Write 13.50 0.28 13.41 0.30 0.09
> anonymous Sequential Read 3.88 0.21 3.89 0.23 -0.01
> Sequential Write 7.86 0.39 7.80 0.34 0.05
> Random Read 7.67 1.60 6.86 1.27 0.80
> Random Write 13.50 0.25 13.52 0.31 -0.01
>
> From this result, although fluctuation is relatively large for random read,
> differences between vanilla kernel and patched one are within the deviations and
> it seems that adding direct I/O lock makes little or no impact on performance.
First, thanks for doing the testing!
> And I know the workload of this benchmark can be too simple, so please
> let me know if you think we have another workload to be looked into.
Well, as distasteful as this sounds, I think a benchmark that does I/O
to partial pages would show the problem best. And yes, this does happen
in the real world. ;-) So, sequential 512 byte or 1k or 2k I/Os, or
just misalign larger I/Os so that two sequential I/Os will hit the same
page.
I believe you can use fio to generate such a workload; see iomem_align
in the man page. Something like the below *might* work. If not, then
simply changing the bs=4k to bs=2k and getting rid of iomem_align should
show the problem.
Cheers,
Jeff
[global]
ioengine=libaio
iodepth=32
bs=4k
direct=1
size=2g
overwrite=1
[test1]
rw=write
iomem_align=2k
On Mon, Aug 16, 2010 at 07:19:58AM -0500, Christoph Lameter wrote:
> On Mon, 16 Aug 2010, Naoya Horiguchi wrote:
>
> > In my understanding, in current code "other processors increasing refcount
> > during migration" can happen both in non-hugepage direct I/O and in hugepage
> > direct I/O in the similar way (i.e. get_user_pages_fast() from dio_refill_pages()).
> > So I think there is no specific problem to hugepage.
> > Or am I missing your point?
>
> With a single page there is the check of the refcount during migration
> after all the references have been removed (at that point the page is no
> longer mapped by any process and direct iO can no longer be
> initiated without a page fault.
The same checking mechanism works for hugepages.
>
> I see that you are running try_to_unmap() from unmap_and_move_huge_page().
Yes, that's right.
>
> I dont see a patch adding huge page support to try_to_unmap though. How
> does this work?
I previously posted the "hugetlb, rmap: add reverse mapping for hugepage" patch,
which enables try_to_unmap() to work on hugepages by teaching it to handle
anon_vma and mapcount for hugepages. For details refer to the following commit:
commit 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f
Author: Naoya Horiguchi <[email protected]>
Date: Fri May 28 09:29:16 2010 +0900
hugetlb, rmap: add reverse mapping for hugepage
(Current "Hugepage migration" patchset is based on 2.6.35-rc3.
So I'll rebase it onto the latest release in the next post.)
Thanks,
Naoya Horiguchi
On Tue, 10 Aug 2010, Naoya Horiguchi wrote:
> diff --git linux-mce-hwpoison/include/linux/hugetlb.h linux-mce-hwpoison/include/linux/hugetlb.h
> index f479700..142bd4f 100644
> --- linux-mce-hwpoison/include/linux/hugetlb.h
> +++ linux-mce-hwpoison/include/linux/hugetlb.h
> @@ -228,6 +228,8 @@ struct huge_bootmem_page {
> struct hstate *hstate;
> };
>
> +struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid);
> +
> /* arch callback */
> int __init alloc_bootmem_huge_page(struct hstate *h);
>
> @@ -303,6 +305,7 @@ static inline struct hstate *page_hstate(struct page *page)
>
> #else
> struct hstate {};
> +#define alloc_huge_page_no_vma_node(h, nid) NULL
> #define alloc_bootmem_huge_page(h) NULL
> #define hstate_file(f) NULL
> #define hstate_vma(v) NULL
> diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
> index 5c77a73..2815b83 100644
> --- linux-mce-hwpoison/mm/hugetlb.c
> +++ linux-mce-hwpoison/mm/hugetlb.c
> @@ -466,11 +466,22 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
> h->free_huge_pages_node[nid]++;
> }
>
> +static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
> +{
> + struct page *page;
> + if (list_empty(&h->hugepage_freelists[nid]))
> + return NULL;
> + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
> + list_del(&page->lru);
> + h->free_huge_pages--;
> + h->free_huge_pages_node[nid]--;
> + return page;
> +}
> +
> static struct page *dequeue_huge_page_vma(struct hstate *h,
> struct vm_area_struct *vma,
> unsigned long address, int avoid_reserve)
> {
> - int nid;
> struct page *page = NULL;
> struct mempolicy *mpol;
> nodemask_t *nodemask;
> @@ -496,19 +507,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> MAX_NR_ZONES - 1, nodemask) {
> - nid = zone_to_nid(zone);
> - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
> - !list_empty(&h->hugepage_freelists[nid])) {
> - page = list_entry(h->hugepage_freelists[nid].next,
> - struct page, lru);
> - list_del(&page->lru);
> - h->free_huge_pages--;
> - h->free_huge_pages_node[nid]--;
> -
> - if (!avoid_reserve)
> - decrement_hugepage_resv_vma(h, vma);
> -
> - break;
> + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
> + page = dequeue_huge_page_node(h, zone_to_nid(zone));
> + if (page) {
> + if (!avoid_reserve)
> + decrement_hugepage_resv_vma(h, vma);
> + break;
> + }
> }
> }
> err:
> @@ -616,7 +621,7 @@ int PageHuge(struct page *page)
> }
> EXPORT_SYMBOL_GPL(PageHuge);
>
> -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> +static struct page *__alloc_huge_page_node(struct hstate *h, int nid)
> {
> struct page *page;
>
> @@ -627,14 +632,61 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
> __GFP_REPEAT|__GFP_NOWARN,
> huge_page_order(h));
> + if (page && arch_prepare_hugepage(page)) {
> + __free_pages(page, huge_page_order(h));
> + return NULL;
> + }
> +
> + return page;
> +}
> +
> +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> +{
> + struct page *page = __alloc_huge_page_node(h, nid);
> + if (page)
> + prep_new_huge_page(h, page, nid);
> + return page;
> +}
> +
> +static struct page *alloc_buddy_huge_page_node(struct hstate *h, int nid)
> +{
> + struct page *page = __alloc_huge_page_node(h, nid);
> if (page) {
> - if (arch_prepare_hugepage(page)) {
> - __free_pages(page, huge_page_order(h));
> + set_compound_page_dtor(page, free_huge_page);
> + spin_lock(&hugetlb_lock);
> + h->nr_huge_pages++;
> + h->nr_huge_pages_node[nid]++;
> + spin_unlock(&hugetlb_lock);
> + put_page_testzero(page);
> + }
> + return page;
> +}
> +
> +/*
> + * This allocation function is useful in the context where vma is irrelevant.
> + * E.g. soft-offlining uses this function because it only cares physical
> + * address of error page.
> + */
> +struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
> +{
> + struct page *page;
> +
> + spin_lock(&hugetlb_lock);
> + get_mems_allowed();
Why is this calling get_mems_allowed()? dequeue_huge_page_node() isn't
concerned if nid can be allocated by current in this context.
> + page = dequeue_huge_page_node(h, nid);
> + put_mems_allowed();
> + spin_unlock(&hugetlb_lock);
> +
> + if (!page) {
> + page = alloc_buddy_huge_page_node(h, nid);
> + if (!page) {
> + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> return NULL;
> - }
> - prep_new_huge_page(h, page, nid);
> + } else
> + __count_vm_event(HTLB_BUDDY_PGALLOC);
> }
>
> + set_page_refcounted(page);
Possibility of NULL pointer dereference?
> return page;
> }
>
On Tue, Aug 17, 2010 at 11:37:19AM +0900, Naoya Horiguchi wrote:
> On Mon, Aug 16, 2010 at 07:19:58AM -0500, Christoph Lameter wrote:
> > On Mon, 16 Aug 2010, Naoya Horiguchi wrote:
> >
> > > In my understanding, in current code "other processors increasing refcount
> > > during migration" can happen both in non-hugepage direct I/O and in hugepage
> > > direct I/O in the similar way (i.e. get_user_pages_fast() from dio_refill_pages()).
> > > So I think there is no specific problem to hugepage.
> > > Or am I missing your point?
> >
> > With a single page there is the check of the refcount during migration
> > after all the references have been removed (at that point the page is no
> > longer mapped by any process and direct iO can no longer be
> > initiated without a page fault.
>
> The same checking mechanism works for hugeapge.
So, my previous comment below was not correct:
>>> This patch only handles migration under direct I/O.
>>> For the opposite (direct I/O under migration) it's not true.
>>> I wrote additional patches (later I'll reply to this email)
>>> for solving locking problem. Could you review them?
The hugepage migration patchset should work fine without the
additional page locking patch.
Please ignore the additional page locking patchset
and review the hugepage migration patchset only.
Sorry for the confusion.
I explain below why the page lock in direct I/O is not needed to avoid
the race with migration. This is true for both hugepages and non-huge pages.
The race between page migration and direct I/O is in essence the one between
try_to_unmap() in unmap_and_move() and get_user_pages_fast() in dio_get_page().
When try_to_unmap() is called before get_user_pages_fast(),
all ptes pointing to the page to be migrated are replaced with migration
swap entries, so the direct I/O code takes a page fault.
In the fault handler, the kernel finds the migration swap entry and waits
in migration_entry_wait() for the page lock (taken by the migration code
before try_to_unmap()) to be released, so direct I/O blocks until migration
completes.
When get_user_pages_fast() is called before try_to_unmap(),
the direct I/O code increments the refcount on the target page.
Because this refcount is not associated with the mapping,
the migration code will find the remaining references after try_to_unmap()
has removed all mappings. The refcount check then makes migration fail,
so direct I/O continues safely.
Thanks,
Naoya Horiguchi
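To make the refcount argument above concrete, a simplified sketch of the check
migration performs after unmapping could look like the following. The helper
name and the way expected_count is obtained are assumptions for illustration
only; this is not the actual mm/migrate.c code.

static int check_page_freezable(struct page *page, int expected_count)
{
	/*
	 * After try_to_unmap() all ptes are gone, so the only references
	 * that should remain are the one taken when the page was isolated
	 * for migration plus the mapping's own reference.  Any extra
	 * reference, e.g. one taken by get_user_pages_fast() from the
	 * direct I/O path, makes the count higher than expected.
	 */
	if (page_count(page) != expected_count)
		return -EAGAIN;	/* abort migration; direct I/O proceeds */

	return 0;		/* safe to copy data and switch the mapping */
}

In this simplified view, an in-flight direct I/O turns into a failed (and
possibly retried) migration rather than data corruption.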
On Mon, Aug 16, 2010 at 09:20:05AM -0400, Jeff Moyer wrote:
> Naoya Horiguchi <[email protected]> writes:
>
> > Hi,
> >
> > On Thu, Aug 12, 2010 at 09:42:21AM -0400, Jeff Moyer wrote:
> >> Naoya Horiguchi <[email protected]> writes:
> >>
> >> > Basically it is user's responsibility to take care of race condition
> >> > related to direct I/O, but some events which are out of user's control
> >> > (such as memory failure) can happen at any time. So we need to lock and
> >> > set/clear PG_writeback flags in dierct I/O code to protect from data loss.
> >>
> >> Did you do any performance testing of this? If not, please do and
> >> report back. I'm betting users won't be pleased with the results.
> >
> > Here is the result of my direct I/O benchmarck, which mesures the time
> > it takes to do direct I/O for 20000 pages on 2MB buffer for four types
> > of I/O. Each I/O is issued for one page unit and each number below is
> > the average of 25 runs.
> >
> > with patchset 2.6.35-rc3
> > Buffer I/O type average(s) STD(s) average(s) STD(s) diff(s)
> > hugepage Sequential Read 3.87 0.16 3.88 0.20 -0.01
> > Sequential Write 7.69 0.43 7.69 0.43 0.00
> > Random Read 5.93 1.58 6.49 1.45 -0.55
> > Random Write 13.50 0.28 13.41 0.30 0.09
> > anonymous Sequential Read 3.88 0.21 3.89 0.23 -0.01
> > Sequential Write 7.86 0.39 7.80 0.34 0.05
> > Random Read 7.67 1.60 6.86 1.27 0.80
> > Random Write 13.50 0.25 13.52 0.31 -0.01
> >
> > From this result, although fluctuation is relatively large for random read,
> > differences between vanilla kernel and patched one are within the deviations and
> > it seems that adding direct I/O lock makes little or no impact on performance.
>
> First, thanks for doing the testing!
>
> > And I know the workload of this benchmark can be too simple, so please
> > let me know if you think we have another workload to be looked into.
>
> Well, as distasteful as this sounds, I think a benchmark that does I/O
> to partial pages would show the problem best. And yes, this does happen
> in the real world. ;-) So, sequential 512 byte or 1k or 2k I/Os, or
> just misalign larger I/Os so that two sequential I/Os will hit the same
> page.
>
> I believe you can use fio to generate such a workload; see iomem_align
> in the man page. Something like the below *might* work. If not, then
> simply changing the bs=4k to bs=2k and getting rid of iomem_align should
> show the problem.
Thank you for the information.
I measured direct I/O performance with small block sizes and a misaligned setup.
The result is shown here:
                          average bandwidth
                     with patchset    2.6.35-rc3      diff
bs=512                  1,412KB/s      1,789KB/s    -26.6%
bs=1k                   2,973KB/s      3,440KB/s    -13.6%
bs=2k                   6,895KB/s      6,519KB/s     +5.7%
bs=4k                  13,357KB/s     13,264KB/s     +0.7%
bs=4k misalign=2k      10,706KB/s     13,132KB/s    -18.5%
As you guessed, the performance obviously degrades when the block size is small
and when I/O is misaligned.
BTW, from the discussion with Christoph I noticed my misunderstanding
about the necessity of additional page locking. It would seem that
without page locking there is no danger of racing between direct I/O and
page migration. So I retract this additional locking patch-set.
Thanks,
Naoya Horiguchi
> When get_user_pages_fast() is called before try_to_unmap(),
> direct I/O code increments refcount on the target page.
> Because this refcount is not associated to the mapping,
> migration code will find remaining refcounts after try_to_unmap()
> unmaps all mappings. Then refcount check decides migration to fail,
> so direct I/O is continued safely.
This would imply that direct I/O can make migration fail arbitrarily.
Also not good. Should we add some retries, at least for the soft offline
case?
-Andi
--
[email protected] -- Speaking for myself only.
Naoya Horiguchi <[email protected]> writes:
> BTW, from the discussion with Christoph I noticed my misunderstanding
> about the necessity of additional page locking. It would seem that
> without page locking there is no danger of racing between direct I/O and
> page migration. So I retract this additional locking patch-set.
OK, great! ;-)
Cheers,
Jeff
On Tue, Aug 17, 2010 at 09:46:56AM -0400, Jeff Moyer wrote:
> Naoya Horiguchi <[email protected]> writes:
>
> > BTW, from the discussion with Christoph I noticed my misunderstanding
> > about the necessity of additional page locking. It would seem that
> > without page locking there is no danger of racing between direct I/O and
> > page migration. So I retract this additional locking patch-set.
>
> OK, great! ;-)
Well it sounds like we still may need something. It isn't good if O_DIRECT
can starve (or DoS) migration.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, 17 Aug 2010, Andi Kleen wrote:
> On Tue, Aug 17, 2010 at 09:46:56AM -0400, Jeff Moyer wrote:
> > Naoya Horiguchi <[email protected]> writes:
> >
> > > BTW, from the discussion with Christoph I noticed my misunderstanding
> > > about the necessity of additional page locking. It would seem that
> > > without page locking there is no danger of racing between direct I/O and
> > > page migration. So I retract this additional locking patch-set.
> >
> > OK, great! ;-)
>
> Well it sounds like we still may need something. It isn't good if O_DIRECT
> can starve (or DoS) migration.
Anything that increments a page refcount can interfere with migration and
cause migration to be aborted and retried later. Not something new. There
never was any guarantee that migration must succeed.
On Tue, Aug 10, 2010 at 05:27:36PM +0800, Naoya Horiguchi wrote:
> In order to handle metadatum correctly, we should check whether the hugepage
> we are going to access is HWPOISONed *before* incrementing mapcount,
> adding the hugepage into pagecache or constructing anon_vma.
> This patch also adds retry code when there is a race between
> alloc_huge_page() and memory failure.
This duplicates the PageHWPoison() test into 3 places without really
addressing any problem. For example, there are still _unavoidable_ races
between PageHWPoison() and add_to_page_cache().
What's the problem you are trying to resolve here? If there is
data structure corruption, we may need to handle it in some other way.
Thanks,
Fengguang
> Signed-off-by: Naoya Horiguchi <[email protected]>
> Signed-off-by: Jun'ichi Nomura <[email protected]>
> ---
> mm/hugetlb.c | 34 +++++++++++++++++++++-------------
> 1 files changed, 21 insertions(+), 13 deletions(-)
>
> diff --git linux-mce-hwpoison/mm/hugetlb.c linux-mce-hwpoison/mm/hugetlb.c
> index a26c24a..5c77a73 100644
> --- linux-mce-hwpoison/mm/hugetlb.c
> +++ linux-mce-hwpoison/mm/hugetlb.c
> @@ -2490,8 +2490,15 @@ retry:
> int err;
> struct inode *inode = mapping->host;
>
> - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
> + lock_page(page);
> + if (unlikely(PageHWPoison(page))) {
> + unlock_page(page);
> + goto retry;
> + }
> + err = add_to_page_cache_locked(page, mapping,
> + idx, GFP_KERNEL);
> if (err) {
> + unlock_page(page);
> put_page(page);
> if (err == -EEXIST)
> goto retry;
> @@ -2504,6 +2511,10 @@ retry:
> page_dup_rmap(page);
> } else {
> lock_page(page);
> + if (unlikely(PageHWPoison(page))) {
> + unlock_page(page);
> + goto retry;
> + }
> if (unlikely(anon_vma_prepare(vma))) {
> ret = VM_FAULT_OOM;
> goto backout_unlocked;
> @@ -2511,22 +2522,19 @@ retry:
> hugepage_add_new_anon_rmap(page, vma, address);
> }
> } else {
> + /*
> + * If memory error occurs between mmap() and fault, some process
> + * don't have hwpoisoned swap entry for errored virtual address.
> + * So we need to block hugepage fault by PG_hwpoison bit check.
> + */
> + if (unlikely(PageHWPoison(page))) {
> + ret = VM_FAULT_HWPOISON;
> + goto backout_unlocked;
> + }
> page_dup_rmap(page);
> }
>
> /*
> - * Since memory error handler replaces pte into hwpoison swap entry
> - * at the time of error handling, a process which reserved but not have
> - * the mapping to the error hugepage does not have hwpoison swap entry.
> - * So we need to block accesses from such a process by checking
> - * PG_hwpoison bit here.
> - */
> - if (unlikely(PageHWPoison(page))) {
> - ret = VM_FAULT_HWPOISON;
> - goto backout_unlocked;
> - }
> -
> - /*
> * If we are going to COW a private mapping later, we examine the
> * pending reservations for this page now. This will ensure that
> * any allocations necessary to record that reservation occur outside
> --
> 1.7.2.1
Hi,
On Mon, Aug 16, 2010 at 11:51:30PM -0700, David Rientjes wrote:
> On Tue, 10 Aug 2010, Naoya Horiguchi wrote:
...
> > +/*
> > + * This allocation function is useful in the context where vma is irrelevant.
> > + * E.g. soft-offlining uses this function because it only cares physical
> > + * address of error page.
> > + */
> > +struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
> > +{
> > + struct page *page;
> > +
> > + spin_lock(&hugetlb_lock);
> > + get_mems_allowed();
>
> Why is this calling get_mems_allowed()? dequeue_huge_page_node() isn't
> concerned if nid can be allocated by current in this context.
OK, I'll remove this.
> > + page = dequeue_huge_page_node(h, nid);
> > + put_mems_allowed();
> > + spin_unlock(&hugetlb_lock);
> > +
> > + if (!page) {
> > + page = alloc_buddy_huge_page_node(h, nid);
> > + if (!page) {
> > + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> > return NULL;
> > - }
> > - prep_new_huge_page(h, page, nid);
> > + } else
> > + __count_vm_event(HTLB_BUDDY_PGALLOC);
> > }
> >
> > + set_page_refcounted(page);
>
> Possibility of NULL pointer dereference?
I think this allocation function returns without calling
set_page_refcounted() if page == NULL. Or do you mean another point?
Thanks,
Naoya Horiguchi
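For reference, here is a sketch of how alloc_huge_page_no_vma_node() might look
with the get_mems_allowed()/put_mems_allowed() pair dropped, based on the quoted
patch (this is only an illustration of the discussed change, not the reposted
patch itself). The error paths are unchanged, and set_page_refcounted() is
still only reached with a non-NULL page:

struct page *alloc_huge_page_no_vma_node(struct hstate *h, int nid)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);
	spin_unlock(&hugetlb_lock);

	if (!page) {
		page = alloc_buddy_huge_page_node(h, nid);
		if (!page) {
			__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
			return NULL;
		}
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	}

	/* only reached with a valid page, so no NULL dereference here */
	set_page_refcounted(page);
	return page;
}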
On Tue, Aug 17, 2010 at 11:40:08AM +0200, Andi Kleen wrote:
> > When get_user_pages_fast() is called before try_to_unmap(),
> > direct I/O code increments refcount on the target page.
> > Because this refcount is not associated to the mapping,
> > migration code will find remaining refcounts after try_to_unmap()
> > unmaps all mappings. Then refcount check decides migration to fail,
> > so direct I/O is continued safely.
>
> This would imply that direct IO can make migration fail arbitarily.
> Also not good. Should we add some retries, at least for the soft offline
> case?
Soft offline is kicked from userspace, so the retry logic can be implemented
in userspace. However, currently we can't distinguish a migration failure from
an "unknown page" error by the return value (-EIO in both cases for now).
How about changing the return value to -EAGAIN when migration fails?
Thanks,
Naoya Horiguchi
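A minimal sketch of the proposed return-value split follows; the helper and its
arguments are hypothetical and only meant to show how userspace could tell a
retryable migration failure apart from an unknown page:

/* hypothetical helper, not part of the posted patches */
static int soft_offline_result(bool page_supported, int migrate_ret)
{
	if (!page_supported)
		return -EIO;	/* unknown/unsupported page: not worth retrying */
	if (migrate_ret)
		return -EAGAIN;	/* migration failed: userspace may retry later */
	return 0;		/* page migrated and marked as soft-offlined */
}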
On Wed, Aug 18, 2010 at 04:32:34PM +0900, Naoya Horiguchi wrote:
> On Tue, Aug 17, 2010 at 11:40:08AM +0200, Andi Kleen wrote:
> > > When get_user_pages_fast() is called before try_to_unmap(),
> > > direct I/O code increments refcount on the target page.
> > > Because this refcount is not associated to the mapping,
> > > migration code will find remaining refcounts after try_to_unmap()
> > > unmaps all mappings. Then refcount check decides migration to fail,
> > > so direct I/O is continued safely.
> >
> > This would imply that direct IO can make migration fail arbitarily.
> > Also not good. Should we add some retries, at least for the soft offline
> > case?
>
> Soft offline is kicked from userspace, so the retry logic can be implemented
> in userspace. However, currently we can't distinguish migration failure from
I don't think user space is the right place for retry logic.
It doesn't really have enough information to make a good decision on when
to retry.
Also I would consider requiring user space to work around kernel problems like
that to be bad design.
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, Aug 10, 2010 at 06:27:43PM +0900, Naoya Horiguchi wrote:
> debugfs:hwpoison/renew-pfn is the old interface.
> This patch renames and fixes it.
Thanks for the fix!
Acked-by: Wu Fengguang <[email protected]>
> +void increment_corrupted_huge_page(struct page *page);
> +void decrement_corrupted_huge_page(struct page *page);
nitpick: increment/decrement are not verbs.
> +void increment_corrupted_huge_page(struct page *hpage)
> +{
> + struct hstate *h = page_hstate(hpage);
> + spin_lock(&hugetlb_lock);
> + h->corrupted_huge_pages++;
> + spin_unlock(&hugetlb_lock);
> +}
> +
> +void decrement_corrupted_huge_page(struct page *hpage)
> +{
> + struct hstate *h = page_hstate(hpage);
> + spin_lock(&hugetlb_lock);
> + BUG_ON(!h->corrupted_huge_pages);
There is no point to have BUG_ON() here:
/*
* Don't use BUG() or BUG_ON() unless there's really no way out; one
* example might be detecting data structure corruption in the middle
* of an operation that can't be backed out of. If the (sub)system
* can somehow continue operating, perhaps with reduced functionality,
* it's probably not BUG-worthy.
*
* If you're tempted to BUG(), think again: is completely giving up
* really the *only* solution? There are usually better options, where
* users don't need to reboot ASAP and can mostly shut down cleanly.
*/
And there is a race case where (corrupted_huge_pages == 0)!
Suppose user space calls unpoison_memory() on a good pfn, and the page
happens to be hwpoisoned between lock_page() and TestClearPageHWPoison();
then corrupted_huge_pages will go negative.
Thanks,
Fengguang
> + h->corrupted_huge_pages--;
> + spin_unlock(&hugetlb_lock);
> +}
On Wed, Aug 18, 2010 at 08:18:42AM +0800, Wu Fengguang wrote:
> On Tue, Aug 10, 2010 at 05:27:36PM +0800, Naoya Horiguchi wrote:
> > In order to handle metadatum correctly, we should check whether the hugepage
> > we are going to access is HWPOISONed *before* incrementing mapcount,
> > adding the hugepage into pagecache or constructing anon_vma.
> > This patch also adds retry code when there is a race between
> > alloc_huge_page() and memory failure.
>
> This duplicates the PageHWPoison() test into 3 places without really
> address any problem. For example, there are still _unavoidable_ races
> between PageHWPoison() and add_to_page_cache().
>
> What's the problem you are trying to resolve here? If there are
> data structure corruption, we may need to do it in some other ways.
The problem I tried to resolve in this patch is the corruption of
data structures when memory failure occurs between alloc_huge_page()
and lock_page().
The corruption occurs because the page fault can fail with metadata changes
(such as refcount, mapcount, etc.) left behind.
Since the PageHWPoison() check is for preventing a hwpoisoned page that remains
in the pagecache from being mapped into the process, it should be done in the
"found in pagecache" branch, not in the common path.
This patch moves the check to the "found in pagecache" branch.
In addition to that, I added 2 PageHWPoison checks in the "new allocation" branches
to improve the possibility of recovering from memory failures on pages under allocation.
But that is a different point from the original one, so I'll drop these retry checks.
Thanks,
Naoya Horiguchi
On Thu, Aug 19, 2010 at 03:55:43PM +0800, Naoya Horiguchi wrote:
> On Wed, Aug 18, 2010 at 08:18:42AM +0800, Wu Fengguang wrote:
> > On Tue, Aug 10, 2010 at 05:27:36PM +0800, Naoya Horiguchi wrote:
> > > In order to handle metadatum correctly, we should check whether the hugepage
> > > we are going to access is HWPOISONed *before* incrementing mapcount,
> > > adding the hugepage into pagecache or constructing anon_vma.
> > > This patch also adds retry code when there is a race between
> > > alloc_huge_page() and memory failure.
> >
> > This duplicates the PageHWPoison() test into 3 places without really
> > address any problem. For example, there are still _unavoidable_ races
> > between PageHWPoison() and add_to_page_cache().
> >
> > What's the problem you are trying to resolve here? If there are
> > data structure corruption, we may need to do it in some other ways.
>
> The problem I tried to resolve in this patch is the corruption of
> data structures when memory failure occurs between alloc_huge_page()
> and lock_page().
> The corruption occurs because page fault can fail with metadata changes
> remained (such as refcount, mapcount, etc.)
> Since the PageHWPoison() check is for avoiding hwpoisoned page remained
> in pagecache mapping to the process, it should be done in
> "found in pagecache" branch, not in the common path.
> This patch moves the check to "found in pagecache" branch.
That's good stuff to put in the changelog.
> In addition to that, I added 2 PageHWPoison checks in "new allocation" branches
> to enhance the possiblity to recover from memory failures on pages under allocation.
> But it's a different point from the original one, so I drop these retry checks.
So you'll remove the first two chunks and retain the 3rd chunk?
That makes it a small bug-fix patch suitable for 2.6.36 and I'll
happily ACK it :)
Thanks,
Fengguang
On Thu, Aug 19, 2010 at 05:28:28PM +0800, Wu Fengguang wrote:
> On Thu, Aug 19, 2010 at 03:55:43PM +0800, Naoya Horiguchi wrote:
> > On Wed, Aug 18, 2010 at 08:18:42AM +0800, Wu Fengguang wrote:
> > > On Tue, Aug 10, 2010 at 05:27:36PM +0800, Naoya Horiguchi wrote:
> > > > In order to handle metadatum correctly, we should check whether the hugepage
> > > > we are going to access is HWPOISONed *before* incrementing mapcount,
> > > > adding the hugepage into pagecache or constructing anon_vma.
> > > > This patch also adds retry code when there is a race between
> > > > alloc_huge_page() and memory failure.
> > >
> > > This duplicates the PageHWPoison() test into 3 places without really
> > > address any problem. For example, there are still _unavoidable_ races
> > > between PageHWPoison() and add_to_page_cache().
> > >
> > > What's the problem you are trying to resolve here? If there are
> > > data structure corruption, we may need to do it in some other ways.
> >
> > The problem I tried to resolve in this patch is the corruption of
> > data structures when memory failure occurs between alloc_huge_page()
> > and lock_page().
> > The corruption occurs because page fault can fail with metadata changes
> > remained (such as refcount, mapcount, etc.)
> > Since the PageHWPoison() check is for avoiding hwpoisoned page remained
> > in pagecache mapping to the process, it should be done in
> > "found in pagecache" branch, not in the common path.
> > This patch moves the check to "found in pagecache" branch.
>
> That's good stuff to put in the changelog.
OK.
> > In addition to that, I added 2 PageHWPoison checks in "new allocation" branches
> > to enhance the possiblity to recover from memory failures on pages under allocation.
> > But it's a different point from the original one, so I drop these retry checks.
>
> So you'll remove the first two chunks and retain the 3rd chunk?
Yes.
> That makes it a small bug-fix patch suitable for 2.6.36 and I'll
> happily ACK it :)
Thank you!
Naoya Horiguchi
On Thu, Aug 19, 2010 at 09:57:52AM +0800, Wu Fengguang wrote:
> > +void increment_corrupted_huge_page(struct page *page);
> > +void decrement_corrupted_huge_page(struct page *page);
>
> nitpick: increment/decrement are not verbs.
OK, increase/decrease are correct.
> > +void increment_corrupted_huge_page(struct page *hpage)
> > +{
> > + struct hstate *h = page_hstate(hpage);
> > + spin_lock(&hugetlb_lock);
> > + h->corrupted_huge_pages++;
> > + spin_unlock(&hugetlb_lock);
> > +}
> > +
> > +void decrement_corrupted_huge_page(struct page *hpage)
> > +{
> > + struct hstate *h = page_hstate(hpage);
> > + spin_lock(&hugetlb_lock);
> > + BUG_ON(!h->corrupted_huge_pages);
>
> There is no point to have BUG_ON() here:
>
> /*
> * Don't use BUG() or BUG_ON() unless there's really no way out; one
> * example might be detecting data structure corruption in the middle
> * of an operation that can't be backed out of. If the (sub)system
> * can somehow continue operating, perhaps with reduced functionality,
> * it's probably not BUG-worthy.
> *
> * If you're tempted to BUG(), think again: is completely giving up
> * really the *only* solution? There are usually better options, where
> * users don't need to reboot ASAP and can mostly shut down cleanly.
> */
OK. I understand.
BUG_ON() is too severe for just a counter.
>
> And there is a race case that (corrupted_huge_pages==0)!
> Suppose the user space calls unpoison_memory() on a good pfn, and the page
> happen to be hwpoisoned between lock_page() and TestClearPageHWPoison(),
> corrupted_huge_pages will go negative.
I see.
When this race happens, unpoison runs and decreases HugePages_Crpt,
but the racing memory failure returns without increasing it.
Yes, this is a problem we need to fix.
Moreover, for hugepages we should pay attention to the possibility of an
mce_bad_pages mismatch, which can occur through a race between unpoison and
multiple memory failures, where each failure increases mce_bad_pages
by the number of pages in a hugepage.
I think counting corrupted hugepages is not directly related to
hugepage migration, and this problem only affects the counter,
not other behaviors, so I'll separate the hugepage counter fix
from this patch set and post it as another patch series. Is this OK?
Thanks,
Naoya Horiguchi
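One possible shape for the counter update discussed above, shown only as a
sketch: the increase/decrease naming follows the renaming agreed earlier in the
thread, and the warning path is an assumption rather than the final fix.

void decrease_corrupted_huge_page(struct page *hpage)
{
	struct hstate *h = page_hstate(hpage);

	spin_lock(&hugetlb_lock);
	if (h->corrupted_huge_pages)
		h->corrupted_huge_pages--;
	else
		/* racy unpoison: the failure path never counted this page */
		printk(KERN_WARNING "corrupted_huge_pages underflow avoided\n");
	spin_unlock(&hugetlb_lock);
}

This keeps the update under hugetlb_lock and degrades the race into a warning
instead of a BUG() or a negative counter.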
On Tue, Aug 24, 2010 at 11:01:33AM +0800, Naoya Horiguchi wrote:
> On Thu, Aug 19, 2010 at 09:57:52AM +0800, Wu Fengguang wrote:
> > > +void increment_corrupted_huge_page(struct page *page);
> > > +void decrement_corrupted_huge_page(struct page *page);
> >
> > nitpick: increment/decrement are not verbs.
>
> OK, increase/decrease are correct.
>
>
> > > +void increment_corrupted_huge_page(struct page *hpage)
> > > +{
> > > + struct hstate *h = page_hstate(hpage);
> > > + spin_lock(&hugetlb_lock);
> > > + h->corrupted_huge_pages++;
> > > + spin_unlock(&hugetlb_lock);
> > > +}
> > > +
> > > +void decrement_corrupted_huge_page(struct page *hpage)
> > > +{
> > > + struct hstate *h = page_hstate(hpage);
> > > + spin_lock(&hugetlb_lock);
> > > + BUG_ON(!h->corrupted_huge_pages);
> >
> > There is no point to have BUG_ON() here:
> >
> > /*
> > * Don't use BUG() or BUG_ON() unless there's really no way out; one
> > * example might be detecting data structure corruption in the middle
> > * of an operation that can't be backed out of. If the (sub)system
> > * can somehow continue operating, perhaps with reduced functionality,
> > * it's probably not BUG-worthy.
> > *
> > * If you're tempted to BUG(), think again: is completely giving up
> > * really the *only* solution? There are usually better options, where
> > * users don't need to reboot ASAP and can mostly shut down cleanly.
> > */
>
> OK. I understand.
> BUG_ON() is too severe for just a counter.
>
> >
> > And there is a race case that (corrupted_huge_pages==0)!
> > Suppose the user space calls unpoison_memory() on a good pfn, and the page
> > happen to be hwpoisoned between lock_page() and TestClearPageHWPoison(),
> > corrupted_huge_pages will go negative.
>
> I see.
> When this race happens, unpoison runs and decreases HugePages_Crpt,
> but racing memory failure returns without increasing it.
> Yes, this is a problem we need to fix.
>
> Moreover for hugepage we should pay attention to the possiblity of
> mce_bad_pages mismatch which can occur by race between unpoison and
> multiple memory failures, where each failure increases mce_bad_pages
> by the number of pages in a hugepage.
Yup.
> I think counting corrupted hugepages is not directly related to
> hugepage migration, and this problem only affects the counter,
> not other behaviors, so I'll separate hugepage counter fix patch
> from this patch set and post as another patch series. Is this OK?
That would be better, thanks.
Thanks,
Fengguang