2017-12-04 14:01:29

by Michal Hocko

Subject: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

Hi,
this is a follow-up to [1] for the allocation API and [2] for the
hugetlb migration. It wasn't easy to split those into two separate
patch series as they share some code.

My primary motivation for touching this code is to make gigantic page
migration work. The gigantic page allocation code is currently too
fragile and hacked into the hugetlb code. This series moves gigantic
pages closer to being a first class citizen. We are not there yet, but
5 patches are already quite a lot and they make the code much easier to
follow. I will come with further changes on top once this has seen some
review.

The first two patches should be trivial to review. The third patch
changes the way we migrate huge pages. Currently, newly allocated pages
are subject to the overcommit check and they participate in surplus
accounting, which is quite unfortunate, as the changelog explains. This
patch doesn't change anything wrt. gigantic pages.
Patch #4 removes the surplus accounting hack from
__alloc_surplus_huge_page. I hope I didn't miss anything there; a
deeper review is really due.
Patch #5 finally unifies the allocation paths so that gigantic pages are
no longer special in any way. There is also some renaming going on.

Shortlog
Michal Hocko (5):
mm, hugetlb: unify core page allocation accounting and initialization
mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
mm, hugetlb: do not rely on overcommit limit during migration
mm, hugetlb: get rid of surplus page accounting tricks
mm, hugetlb: further simplify hugetlb allocation API

Diffstat:
include/linux/hugetlb.h | 3 +
mm/hugetlb.c | 305 +++++++++++++++++++++++++++---------------------
mm/migrate.c | 3 +-
3 files changed, 175 insertions(+), 136 deletions(-)


[1] http://lkml.kernel.org/r/[email protected]
[2] http://lkml.kernel.org/r/[email protected]


2017-12-04 14:01:38

by Michal Hocko

Subject: [RFC PATCH 2/5] mm, hugetlb: integrate giga hugetlb more naturally to the allocation path

From: Michal Hocko <[email protected]>

Gigantic hugetlb pages were grown into the hugetlb code as an alien
species with a lot of special casing. The allocation path is no
exception. Unnecessarily so, to be honest. It is true that the
underlying allocator is different, but that is an implementation detail.

This patch unifies the hugetlb allocation path that prepares fresh pool
pages. alloc_fresh_gigantic_page basically copies the
alloc_fresh_huge_page logic, so we can move everything there. This also
simplifies set_max_huge_pages, which no longer has to care about what
kind of huge page we allocate.
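
For illustration only, the allocation step in alloc_fresh_huge_page
ends up looking roughly like the following consolidated view (the
actual change is split across two hunks in the diff below):

        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
                if (hstate_is_gigantic(h))
                        page = alloc_gigantic_page(h, gfp_mask,
                                        node, nodes_allowed);
                else
                        page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
                                        node, nodes_allowed);
                if (page)
                        break;
        }

        if (!page)
                return 0;

        /* gigantic pages need their compound metadata set up first */
        if (hstate_is_gigantic(h))
                prep_compound_gigantic_page(page, huge_page_order(h));
        prep_new_huge_page(h, page, page_to_nid(page));
        put_page(page); /* free it into the hugepage allocator */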

Signed-off-by: Michal Hocko <[email protected]>
---
mm/hugetlb.c | 53 ++++++++++++-----------------------------------------
1 file changed, 12 insertions(+), 41 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8189c92fac82..ac105fb32620 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1106,7 +1106,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}

-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
@@ -1114,11 +1115,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- gfp_t gfp_mask;

- gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
spin_lock_irqsave(&zone->lock, flags);

pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1149,42 +1148,11 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);

-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
- struct page *page;
-
- page = alloc_gigantic_page(nid, h);
- if (page) {
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, nid);
- put_page(page); /* free it into the hugepage allocator */
- }
-
- return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- struct page *page = NULL;
- int nr_nodes, node;
-
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_gigantic_page_node(h, node);
- if (page)
- return 1;
- }
-
- return 0;
-}
-
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed) { return 0; }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1410,8 +1378,12 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
- node, nodes_allowed);
+ if (hstate_is_gigantic(h))
+ page = alloc_gigantic_page(h, gfp_mask,
+ node, nodes_allowed);
+ else
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
+ node, nodes_allowed);
if (page)
break;

@@ -1420,6 +1392,8 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
if (!page)
return 0;

+ if (hstate_is_gigantic(h))
+ prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */

@@ -2307,10 +2281,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
/* yield cpu to avoid soft lockup */
cond_resched();

- if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed);
- else
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
--
2.15.0

2017-12-04 14:01:45

by Michal Hocko

Subject: [RFC PATCH 3/5] mm, hugetlb: do not rely on overcommit limit during migration

From: Michal Hocko <[email protected]>

hugepage migration relies on __alloc_buddy_huge_page to get a new page.
This has two main disadvantages.
1) It doesn't allow migrating any huge page if the pool is completely
used, which is not an exceptional case because the pool is static and
unused memory is just wasted.
2) It leads to a weird semantic where migration between two NUMA nodes
might increase the pool size of the destination NUMA node while the page
is in use. The issue is caused by per-NUMA-node surplus page tracking
(see free_huge_page).

Address both issues by changing the way we allocate and account pages
used for migration. Those pages are temporary by definition, so we mark
them that way (we abuse page flags in the 3rd page) and update
free_huge_page to release such pages back to the page allocator. The
page migration path then just transfers the temporary status from the
new page to the old one, which will be freed on its last reference. The
global surplus count never changes along this path, but we still have
to be careful when migrating a per-node surplus page. This is now
handled in move_hugetlb_state, which is called from the migration path;
it copies the hugetlb specific page state and fixes up the accounting
when needed.

Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better
reflect its purpose. The new allocation routine for the migration path
is __alloc_migrate_huge_page.

The user visible effect of this patch is that migrated pages are really
temporary and they travel between NUMA nodes as per the migration
request:
Before migration
/sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1
/sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0

After

/sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0
/sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1
/sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0

With the previous implementation, both nodes would have nr_hugepages:1
until the page is freed.
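
As an illustration only (not part of the patch), the intended lifecycle
of such a temporary page during a successful migration is roughly the
following; "old" and "new" are just placeholder names, the functions are
the ones introduced or changed below:

        /*
         * Allocate the target page; PageHugeTemporary is set and the
         * page is not accounted as surplus.
         */
        new = __alloc_migrate_huge_page(h, gfp_mask, nid, nmask);

        /* ... contents and mappings are migrated by the generic code ... */

        /*
         * new becomes a regular pool page, old inherits the temporary
         * status (and the per-node surplus state is fixed up if needed).
         */
        move_hugetlb_state(old, new, reason);

        /*
         * Last reference: free_huge_page sees PageHugeTemporary and
         * returns the page to the page allocator instead of the pool.
         */
        put_page(old);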

Signed-off-by: Michal Hocko <[email protected]>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 111 +++++++++++++++++++++++++++++++++++++++++-------
mm/migrate.c | 3 +-
3 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6e3696c7b35a..1a9c89850e4a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool isolate_huge_page(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
@@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);

bool is_hugetlb_entry_migration(pte_t pte);
+
#else /* !CONFIG_HUGETLB_PAGE */

static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
return false;
}
#define putback_active_hugepage(p) do {} while (0)
+#define move_hugetlb_state(old, new, reason) do {} while (0)

static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac105fb32620..a1b8b2888ec9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
@@ -1217,6 +1218,28 @@ static void clear_page_huge_active(struct page *page)
ClearPagePrivate(&page[1]);
}

+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+ if (!PageHuge(page))
+ return false;
+
+ return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = NULL;
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -1251,7 +1274,11 @@ void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;

- if (h->surplus_huge_pages_node[nid]) {
+ if (PageHugeTemporary(page)) {
+ list_del(&page->lru);
+ ClearPageHugeTemporary(page);
+ update_and_free_page(h, page);
+ } else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -1505,7 +1532,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}

-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1569,6 +1599,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
return page;
}

+static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ return NULL;
+
+ /*
+ * We do not account these pages as surplus because they are only
+ * temporary and will be released properly on the last reference
+ */
+ prep_new_huge_page(h, page, page_to_nid(page));
+ SetPageHugeTemporary(page);
+
+ return page;
+}
+
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
@@ -1583,17 +1635,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
nodemask_t *nodemask;

nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);

return page;
}

-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1608,12 +1656,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);

if (!page)
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+ page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);

return page;
}

-
+/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask)
{
@@ -1631,9 +1679,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock(&hugetlb_lock);

- /* No reservations, try to overcommit */
-
- return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}

/*
@@ -1661,7 +1707,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
@@ -2258,7 +2304,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with __alloc_buddy_huge_page() here and be unable
+ * We might race with __alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2301,7 +2347,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * __alloc_buddy_huge_page() is checking the global counter,
+ * __alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -4775,3 +4821,36 @@ void putback_active_hugepage(struct page *page)
spin_unlock(&hugetlb_lock);
put_page(page);
}
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+ struct hstate *h = page_hstate(oldpage);
+
+ hugetlb_cgroup_migrate(oldpage, newpage);
+ set_page_owner_migrate_reason(newpage, reason);
+
+ /*
+ * transfer temporary state of the new huge page. This is
+ * reverse to other transitions because the newpage is going to
+ * be final while the old one will be freed so it takes over
+ * the temporary status.
+ *
+ * Also note that we have to transfer the per-node surplus state
+ * here as well otherwise the global surplus count will not match
+ * the per-node's.
+ */
+ if (PageHugeTemporary(newpage)) {
+ int old_nid = page_to_nid(oldpage);
+ int new_nid = page_to_nid(newpage);
+
+ SetPageHugeTemporary(oldpage);
+ ClearPageHugeTemporary(newpage);
+
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages_node[old_nid]) {
+ h->surplus_huge_pages_node[old_nid]--;
+ h->surplus_huge_pages_node[new_nid]++;
+ }
+ spin_unlock(&hugetlb_lock);
+ }
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 4d0be47a322a..1e5525a25691 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
put_anon_vma(anon_vma);

if (rc == MIGRATEPAGE_SUCCESS) {
- hugetlb_cgroup_migrate(hpage, new_hpage);
+ move_hugetlb_state(hpage, new_hpage, reason);
put_new_page = NULL;
- set_page_owner_migrate_reason(new_hpage, reason);
}

unlock_page(hpage);
--
2.15.0

2017-12-04 14:01:41

by Michal Hocko

Subject: [RFC PATCH 1/5] mm, hugetlb: unify core page allocation accounting and initialization

From: Michal Hocko <[email protected]>

hugetlb allocator has two entry points to the page allocator
- alloc_fresh_huge_page_node
- __hugetlb_alloc_buddy_huge_page

The two differ very subtly in two aspects. The latter doesn't care
about the HTLB_BUDDY_* stats and it doesn't initialize the huge page:
prep_new_huge_page is not used there because it not only initializes
hugetlb specific state but also does a put_page which releases the page
to the hugetlb pool, and that is not what is required in some contexts.
This makes things more complicated than necessary.

Simplify things by a) removing the duplicate page allocator entry point
and keeping only __hugetlb_alloc_buddy_huge_page, and b) making
prep_new_huge_page more reusable by removing the put_page which moves
the page to the allocator pool. All current callers are updated to call
put_page explicitly. Later patches will add new callers which won't
need it.

This patch shouldn't introduce any functional change.
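
For illustration, the resulting caller pattern (matching the diff below)
is a plain allocate + prepare + explicit release into the pool:

        page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
                        node, nodes_allowed);
        if (page) {
                prep_new_huge_page(h, page, page_to_nid(page));
                put_page(page); /* free it into the hugepage allocator */
        }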

Signed-off-by: Michal Hocko <[email protected]>
---
mm/hugetlb.c | 61 +++++++++++++++++++++++++++++-------------------------------
1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2c9033d39bfe..8189c92fac82 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1157,6 +1157,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
if (page) {
prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, nid);
+ put_page(page); /* free it into the hugepage allocator */
}

return page;
@@ -1304,7 +1305,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
}

static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1381,41 +1381,49 @@ pgoff_t __basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}

-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
+ int order = huge_page_order(h);
struct page *page;

- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
- huge_page_order(h));
- if (page) {
- prep_new_huge_page(h, page, nid);
- }
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ if (page)
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

return page;
}

+/*
+ * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
+ * manner.
+ */
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
int nr_nodes, node;
- int ret = 0;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page_node(h, node);
- if (page) {
- ret = 1;
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
+ node, nodes_allowed);
+ if (page)
break;
- }
+
}

- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ if (!page)
+ return 0;

- return ret;
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* free it into the hugepage allocator */
+
+ return 1;
}

/*
@@ -1523,17 +1531,6 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}

-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
- int order = huge_page_order(h);
-
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
- return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
@@ -1589,11 +1586,9 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
h->nr_huge_pages_node[r_nid]++;
h->surplus_huge_pages_node[r_nid]++;
- __count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
h->nr_huge_pages--;
h->surplus_huge_pages--;
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
}
spin_unlock(&hugetlb_lock);

@@ -2148,6 +2143,8 @@ static void __init gather_bootmem_prealloc(void)
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* free it into the hugepage allocator */
+
/*
* If we had gigantic hugepages allocated at boot time, we need
* to restore the 'stolen' pages to totalram_pages in order to
--
2.15.0

2017-12-04 14:02:22

by Michal Hocko

Subject: [RFC PATCH 5/5] mm, hugetlb: further simplify hugetlb allocation API

From: Michal Hocko <[email protected]>

The hugetlb allocator has several layers of allocation functions
depending on the purpose of the allocation. There are two underlying
allocators, chosen by whether the page can be allocated from the page
allocator or we need a contiguous allocator. This is currently open
coded in alloc_fresh_huge_page, which is the only path that might
allocate gigantic pages, which require the latter allocator. Create a
new alloc_fresh_huge_page which hides this implementation detail and
use it in all callers which hardcoded the buddy allocator path
(__hugetlb_alloc_buddy_huge_page). This shouldn't introduce any
functional change because both the migration and surplus allocators
exclude gigantic pages explicitly.

While we are at it, let's do some renaming. The current scheme is not
consistent and is overly painful to read and understand. Get rid of the
prefix underscores from most functions. There is no real reason to make
names longer.
* alloc_fresh_huge_page is the new layer that abstracts the underlying
allocator (see the layering sketch after this list)
* __hugetlb_alloc_buddy_huge_page becomes the shorter and neater
alloc_buddy_huge_page
* the former alloc_fresh_huge_page becomes alloc_pool_huge_page because
we put the new page directly into the pool
* alloc_surplus_huge_page can drop the open coded prep_new_huge_page
code as it uses alloc_fresh_huge_page now
* others lose their excessive prefix underscores to make names shorter
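
For illustration only, the resulting layering of the allocation
functions can be summarized as:

        /*
         * alloc_pool_huge_page()     - grows the persistent pool
         * alloc_surplus_huge_page()  - overcommit (surplus) pages
         * alloc_migrate_huge_page()  - temporary pages for migration
         *         |
         *         v
         * alloc_fresh_huge_page()    - common helper hiding the allocator
         *         |
         *         +-- alloc_buddy_huge_page()  - page allocator
         *         +-- alloc_gigantic_page()    - contiguous range allocator
         */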

Signed-off-by: Michal Hocko <[email protected]>
---
mm/hugetlb.c | 74 +++++++++++++++++++++++++++++++++---------------------------
1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0c7dc269b6c0..73c74851a304 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1376,7 +1376,7 @@ pgoff_t __basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}

-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
int order = huge_page_order(h);
@@ -1394,34 +1394,49 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
return page;
}

+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ else
+ page = alloc_buddy_huge_page(h, gfp_mask,
+ nid, nmask);
+ if (!page)
+ return NULL;
+
+ if (hstate_is_gigantic(h))
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, page_to_nid(page));
+
+ return page;
+}
+
/*
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- if (hstate_is_gigantic(h))
- page = alloc_gigantic_page(h, gfp_mask,
- node, nodes_allowed);
- else
- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
- node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
if (page)
break;
-
}

if (!page)
return 0;

- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */

return 1;
@@ -1535,7 +1550,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* Allocates a fresh surplus page from the page allocator.
*/
-static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page = NULL;
@@ -1548,7 +1563,7 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock(&hugetlb_lock);

- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
if (!page)
goto out_unlock;

@@ -1567,12 +1582,6 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
} else {
h->surplus_huge_pages_node[page_to_nid(page)]++;
h->surplus_huge_pages++;
- INIT_LIST_HEAD(&page->lru);
- r_nid = page_to_nid(page);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_hugetlb_cgroup(page, NULL);
- h->nr_huge_pages_node[r_nid]++;
- h->surplus_huge_pages_node[r_nid]++;
}

out_unlock:
@@ -1581,7 +1590,7 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
return page;
}

-static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1589,7 +1598,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
if (hstate_is_gigantic(h))
return NULL;

- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
if (!page)
return NULL;

@@ -1597,7 +1606,6 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- prep_new_huge_page(h, page, page_to_nid(page));
SetPageHugeTemporary(page);

return page;
@@ -1607,7 +1615,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
@@ -1617,7 +1625,7 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
nodemask_t *nodemask;

nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);

return page;
@@ -1638,7 +1646,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);

if (!page)
- page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+ page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);

return page;
}
@@ -1661,7 +1669,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock(&hugetlb_lock);

- return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}

/*
@@ -1689,7 +1697,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
+ page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
@@ -2026,7 +2034,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+ page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2166,7 +2174,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
- } else if (!alloc_fresh_huge_page(h,
+ } else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY]))
break;
cond_resched();
@@ -2286,7 +2294,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with __alloc_surplus_huge_page() here and be unable
+ * We might race with alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2309,7 +2317,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
/* yield cpu to avoid soft lockup */
cond_resched();

- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2329,7 +2337,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * __alloc_surplus_huge_page() is checking the global counter,
+ * alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
--
2.15.0

2017-12-04 14:02:45

by Michal Hocko

Subject: [RFC PATCH 4/5] mm, hugetlb: get rid of surplus page accounting tricks

From: Michal Hocko <[email protected]>

alloc_surplus_huge_page increases the pool size and the number of
surplus pages opportunistically to prevent races with pool size
changes. See d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages
sysctl") for more details.

The resulting code is unnecessarily hairy, causes code duplication and
doesn't allow sharing the allocation paths. Moreover, pool size changes
tend to be very rare, so optimizing for them is not really reasonable.
Simplify the code by allowing a fresh surplus page to be allocated as
long as we are under the overcommit limit, and then recheck the
condition after the allocation and drop the new page if the situation
has changed. This should provide a reasonable guarantee that abrupt
allocation requests will not go way over the limit.

If we consider races with the pool shrinking and enlarging, then we
should be reasonably safe as well. In the first case we are off by one
in the worst case, and the second case should work OK because the page
is not yet visible. We can waste CPU cycles on the allocation, but that
should be acceptable for a relatively rare condition.
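
For illustration only, the simplified check / allocate / recheck flow
described above looks roughly like this (error handling and the
remaining accounting/initialization are as in the diff below):

        spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                spin_unlock(&hugetlb_lock);
                return NULL;
        }
        spin_unlock(&hugetlb_lock);

        page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
        if (!page)
                return NULL;

        spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                /* raced with a pool size change, drop the fresh page */
                SetPageHugeTemporary(page);
                put_page(page);
                page = NULL;
        } else {
                /* account the new page as surplus */
                h->surplus_huge_pages++;
                h->surplus_huge_pages_node[page_to_nid(page)]++;
        }
        spin_unlock(&hugetlb_lock);

        return page;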

Signed-off-by: Michal Hocko <[email protected]>
---
mm/hugetlb.c | 60 +++++++++++++++++++++---------------------------------------
1 file changed, 21 insertions(+), 39 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1b8b2888ec9..0c7dc269b6c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1538,62 +1538,44 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
- unsigned int r_nid;
+ struct page *page = NULL;

if (hstate_is_gigantic(h))
return NULL;

- /*
- * Assume we will successfully allocate the surplus page to
- * prevent racing processes from causing the surplus to exceed
- * overcommit
- *
- * This however introduces a different race, where a process B
- * tries to grow the static hugepage pool while alloc_pages() is
- * called by process A. B will only examine the per-node
- * counters in determining if surplus huge pages can be
- * converted to normal huge pages in adjust_pool_surplus(). A
- * won't be able to increment the per-node counter, until the
- * lock is dropped by B, but B doesn't drop hugetlb_lock until
- * no more huge pages can be converted from surplus to normal
- * state (and doesn't try to convert again). Thus, we have a
- * case where a surplus huge page exists, the pool is grown, and
- * the surplus huge page still exists after, even though it
- * should just have been converted to a normal huge page. This
- * does not leak memory, though, as the hugepage will be freed
- * once it is out of use. It also does not allow the counters to
- * go out of whack in adjust_pool_surplus() as we don't modify
- * the node values until we've gotten the hugepage and only the
- * per-node value is checked there.
- */
spin_lock(&hugetlb_lock);
- if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- spin_unlock(&hugetlb_lock);
- return NULL;
- } else {
- h->nr_huge_pages++;
- h->surplus_huge_pages++;
- }
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+ goto out_unlock;
spin_unlock(&hugetlb_lock);

page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ goto out_unlock;

spin_lock(&hugetlb_lock);
- if (page) {
+ /*
+ * We could have raced with the pool size change.
+ * Double check that and simply deallocate the new page
+ * if we would end up overcommiting the surpluses. Abuse
+ * temporary page to workaround the nasty free_huge_page
+ * codeflow
+ */
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+ SetPageHugeTemporary(page);
+ put_page(page);
+ page = NULL;
+ } else {
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
+ h->surplus_huge_pages++;
INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_hugetlb_cgroup(page, NULL);
- /*
- * We incremented the global counters already
- */
h->nr_huge_pages_node[r_nid]++;
h->surplus_huge_pages_node[r_nid]++;
- } else {
- h->nr_huge_pages--;
- h->surplus_huge_pages--;
}
+
+out_unlock:
spin_unlock(&hugetlb_lock);

return page;
--
2.15.0

2017-12-13 00:26:40

by Mike Kravetz

Subject: Re: [RFC PATCH 1/5] mm, hugetlb: unify core page allocation accounting and initialization

On 12/04/2017 06:01 AM, Michal Hocko wrote:
> From: Michal Hocko <[email protected]>
>
> hugetlb allocator has two entry points to the page allocator
> - alloc_fresh_huge_page_node
> - __hugetlb_alloc_buddy_huge_page
>
> The two differ very subtly in two aspects. The first one doesn't care
> about HTLB_BUDDY_* stats and it doesn't initialize the huge page.
> prep_new_huge_page is not used because it not only initializes hugetlb
> specific stuff but because it also put_page and releases the page to
> the hugetlb pool which is not what is required in some contexts. This
> makes things more complicated than necessary.
>
> Simplify things by a) removing the page allocator entry point duplicity
> and only keep __hugetlb_alloc_buddy_huge_page and b) make
> prep_new_huge_page more reusable by removing the put_page which moves
> the page to the allocator pool. All current callers are updated to call
> put_page explicitly. Later patches will add new callers which won't
> need it.
>
> This patch shouldn't introduce any functional change.
>
> Signed-off-by: Michal Hocko <[email protected]>

Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz

> ---
> mm/hugetlb.c | 61 +++++++++++++++++++++++++++++-------------------------------
> 1 file changed, 29 insertions(+), 32 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 2c9033d39bfe..8189c92fac82 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1157,6 +1157,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
> if (page) {
> prep_compound_gigantic_page(page, huge_page_order(h));
> prep_new_huge_page(h, page, nid);
> + put_page(page); /* free it into the hugepage allocator */
> }
>
> return page;
> @@ -1304,7 +1305,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> h->nr_huge_pages++;
> h->nr_huge_pages_node[nid]++;
> spin_unlock(&hugetlb_lock);
> - put_page(page); /* free it into the hugepage allocator */
> }
>
> static void prep_compound_gigantic_page(struct page *page, unsigned int order)
> @@ -1381,41 +1381,49 @@ pgoff_t __basepage_index(struct page *page)
> return (index << compound_order(page_head)) + compound_idx;
> }
>
> -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
> + gfp_t gfp_mask, int nid, nodemask_t *nmask)
> {
> + int order = huge_page_order(h);
> struct page *page;
>
> - page = __alloc_pages_node(nid,
> - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
> - __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
> - huge_page_order(h));
> - if (page) {
> - prep_new_huge_page(h, page, nid);
> - }
> + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
> + if (nid == NUMA_NO_NODE)
> + nid = numa_mem_id();
> + page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
> + if (page)
> + __count_vm_event(HTLB_BUDDY_PGALLOC);
> + else
> + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
>
> return page;
> }
>
> +/*
> + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
> + * manner.
> + */
> static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> {
> struct page *page;
> int nr_nodes, node;
> - int ret = 0;
> + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
>
> for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> - page = alloc_fresh_huge_page_node(h, node);
> - if (page) {
> - ret = 1;
> + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
> + node, nodes_allowed);
> + if (page)
> break;
> - }
> +
> }
>
> - if (ret)
> - count_vm_event(HTLB_BUDDY_PGALLOC);
> - else
> - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> + if (!page)
> + return 0;
>
> - return ret;
> + prep_new_huge_page(h, page, page_to_nid(page));
> + put_page(page); /* free it into the hugepage allocator */
> +
> + return 1;
> }
>
> /*
> @@ -1523,17 +1531,6 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
> return rc;
> }
>
> -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
> - gfp_t gfp_mask, int nid, nodemask_t *nmask)
> -{
> - int order = huge_page_order(h);
> -
> - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
> - if (nid == NUMA_NO_NODE)
> - nid = numa_mem_id();
> - return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
> -}
> -
> static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
> int nid, nodemask_t *nmask)
> {
> @@ -1589,11 +1586,9 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
> */
> h->nr_huge_pages_node[r_nid]++;
> h->surplus_huge_pages_node[r_nid]++;
> - __count_vm_event(HTLB_BUDDY_PGALLOC);
> } else {
> h->nr_huge_pages--;
> h->surplus_huge_pages--;
> - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> }
> spin_unlock(&hugetlb_lock);
>
> @@ -2148,6 +2143,8 @@ static void __init gather_bootmem_prealloc(void)
> prep_compound_huge_page(page, h->order);
> WARN_ON(PageReserved(page));
> prep_new_huge_page(h, page, page_to_nid(page));
> + put_page(page); /* free it into the hugepage allocator */
> +
> /*
> * If we had gigantic hugepages allocated at boot time, we need
> * to restore the 'stolen' pages to totalram_pages in order to
>

2017-12-13 00:34:41

by Mike Kravetz

Subject: Re: [RFC PATCH 2/5] mm, hugetlb: integrate giga hugetlb more naturally to the allocation path

On 12/04/2017 06:01 AM, Michal Hocko wrote:
> From: Michal Hocko <[email protected]>
>
> Gigantic hugetlb pages were ingrown to the hugetlb code as an alien
> specie with a lot of special casing. The allocation path is not an
> exception. Unnecessarily so to be honest. It is true that the underlying
> allocator is different but that is an implementation detail.
>
> This patch unifies the hugetlb allocation path that a prepares fresh
> pool pages. alloc_fresh_gigantic_page basically copies alloc_fresh_huge_page
> logic so we can move everything there. This will simplify set_max_huge_pages
> which doesn't have to care about what kind of huge page we allocate.
>
> Signed-off-by: Michal Hocko <[email protected]>

I agree with the analysis. Thanks for cleaning this up. There really is
no need for the separate allocation paths.

Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz

> ---
> mm/hugetlb.c | 53 ++++++++++++-----------------------------------------
> 1 file changed, 12 insertions(+), 41 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 8189c92fac82..ac105fb32620 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1106,7 +1106,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
> return zone_spans_pfn(zone, last_pfn);
> }
>
> -static struct page *alloc_gigantic_page(int nid, struct hstate *h)
> +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
> + int nid, nodemask_t *nodemask)
> {
> unsigned int order = huge_page_order(h);
> unsigned long nr_pages = 1 << order;
> @@ -1114,11 +1115,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
> struct zonelist *zonelist;
> struct zone *zone;
> struct zoneref *z;
> - gfp_t gfp_mask;
>
> - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
> zonelist = node_zonelist(nid, gfp_mask);
> - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
> + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
> spin_lock_irqsave(&zone->lock, flags);
>
> pfn = ALIGN(zone->zone_start_pfn, nr_pages);
> @@ -1149,42 +1148,11 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
> static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
> static void prep_compound_gigantic_page(struct page *page, unsigned int order);
>
> -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
> -{
> - struct page *page;
> -
> - page = alloc_gigantic_page(nid, h);
> - if (page) {
> - prep_compound_gigantic_page(page, huge_page_order(h));
> - prep_new_huge_page(h, page, nid);
> - put_page(page); /* free it into the hugepage allocator */
> - }
> -
> - return page;
> -}
> -
> -static int alloc_fresh_gigantic_page(struct hstate *h,
> - nodemask_t *nodes_allowed)
> -{
> - struct page *page = NULL;
> - int nr_nodes, node;
> -
> - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> - page = alloc_fresh_gigantic_page_node(h, node);
> - if (page)
> - return 1;
> - }
> -
> - return 0;
> -}
> -
> #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
> static inline bool gigantic_page_supported(void) { return false; }
> static inline void free_gigantic_page(struct page *page, unsigned int order) { }
> static inline void destroy_compound_gigantic_page(struct page *page,
> unsigned int order) { }
> -static inline int alloc_fresh_gigantic_page(struct hstate *h,
> - nodemask_t *nodes_allowed) { return 0; }
> #endif
>
> static void update_and_free_page(struct hstate *h, struct page *page)
> @@ -1410,8 +1378,12 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
>
> for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
> - node, nodes_allowed);
> + if (hstate_is_gigantic(h))
> + page = alloc_gigantic_page(h, gfp_mask,
> + node, nodes_allowed);
> + else
> + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
> + node, nodes_allowed);
> if (page)
> break;
>
> @@ -1420,6 +1392,8 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> if (!page)
> return 0;
>
> + if (hstate_is_gigantic(h))
> + prep_compound_gigantic_page(page, huge_page_order(h));
> prep_new_huge_page(h, page, page_to_nid(page));
> put_page(page); /* free it into the hugepage allocator */
>
> @@ -2307,10 +2281,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> /* yield cpu to avoid soft lockup */
> cond_resched();
>
> - if (hstate_is_gigantic(h))
> - ret = alloc_fresh_gigantic_page(h, nodes_allowed);
> - else
> - ret = alloc_fresh_huge_page(h, nodes_allowed);
> + ret = alloc_fresh_huge_page(h, nodes_allowed);
> spin_lock(&hugetlb_lock);
> if (!ret)
> goto out;
>

2017-12-13 23:35:53

by Mike Kravetz

Subject: Re: [RFC PATCH 3/5] mm, hugetlb: do not rely on overcommit limit during migration

On 12/04/2017 06:01 AM, Michal Hocko wrote:
> From: Michal Hocko <[email protected]>
>
> hugepage migration relies on __alloc_buddy_huge_page to get a new page.
> This has 2 main disadvantages.
> 1) it doesn't allow to migrate any huge page if the pool is used
> completely which is not an exceptional case as the pool is static and
> unused memory is just wasted.
> 2) it leads to a weird semantic when migration between two numa nodes
> might increase the pool size of the destination NUMA node while the page
> is in use. The issue is caused by per NUMA node surplus pages tracking
> (see free_huge_page).
>
> Address both issues by changing the way how we allocate and account
> pages allocated for migration. Those should temporal by definition.
> So we mark them that way (we will abuse page flags in the 3rd page)
> and update free_huge_page to free such pages to the page allocator.
> Page migration path then just transfers the temporal status from the
> new page to the old one which will be freed on the last reference.
> The global surplus count will never change during this path

The global and per-node user visible count of huge pages will be
temporarily increased by one during this path. This should not
be an issue.

> but we still
> have to be careful when migrating a per-node suprlus page. This is now
> handled in move_hugetlb_state which is called from the migration path
> and it copies the hugetlb specific page state and fixes up the
> accounting when needed
>
> Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better
> reflect its purpose. The new allocation routine for the migration path
> is __alloc_migrate_huge_page.
>
> The user visible effect of this patch is that migrated pages are really
> temporal and they travel between NUMA nodes as per the migration
> request:
> Before migration
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
>
> After
>
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0
> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1
> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
>
> with the previous implementation, both nodes would have nr_hugepages:1
> until the page is freed.

With the previous implementation, the migration would have failed unless
nr_overcommit_hugepages was explicitly set. Correct?

>
> Signed-off-by: Michal Hocko <[email protected]>
> ---
> include/linux/hugetlb.h | 3 ++
> mm/hugetlb.c | 111 +++++++++++++++++++++++++++++++++++++++++-------
> mm/migrate.c | 3 +-
> 3 files changed, 99 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6e3696c7b35a..1a9c89850e4a 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
> long freed);
> bool isolate_huge_page(struct page *page, struct list_head *list);
> void putback_active_hugepage(struct page *page);
> +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
> void free_huge_page(struct page *page);
> void hugetlb_fix_reserve_counts(struct inode *inode);
> extern struct mutex *hugetlb_fault_mutex_table;
> @@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> unsigned long address, unsigned long end, pgprot_t newprot);
>
> bool is_hugetlb_entry_migration(pte_t pte);
> +
> #else /* !CONFIG_HUGETLB_PAGE */
>
> static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> @@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
> return false;
> }
> #define putback_active_hugepage(p) do {} while (0)
> +#define move_hugetlb_state(old, new, reason) do {} while (0)
>
> static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> unsigned long address, unsigned long end, pgprot_t newprot)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ac105fb32620..a1b8b2888ec9 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -34,6 +34,7 @@
> #include <linux/hugetlb_cgroup.h>
> #include <linux/node.h>
> #include <linux/userfaultfd_k.h>
> +#include <linux/page_owner.h>
> #include "internal.h"
>
> int hugetlb_max_hstate __read_mostly;
> @@ -1217,6 +1218,28 @@ static void clear_page_huge_active(struct page *page)
> ClearPagePrivate(&page[1]);
> }
>
> +/*
> + * Internal hugetlb specific page flag. Do not use outside of the hugetlb
> + * code
> + */
> +static inline bool PageHugeTemporary(struct page *page)
> +{
> + if (!PageHuge(page))
> + return false;
> +
> + return (unsigned long)page[2].mapping == -1U;
> +}
> +
> +static inline void SetPageHugeTemporary(struct page *page)
> +{
> + page[2].mapping = (void *)-1U;
> +}
> +
> +static inline void ClearPageHugeTemporary(struct page *page)
> +{
> + page[2].mapping = NULL;
> +}
> +
> void free_huge_page(struct page *page)
> {
> /*
> @@ -1251,7 +1274,11 @@ void free_huge_page(struct page *page)
> if (restore_reserve)
> h->resv_huge_pages++;
>
> - if (h->surplus_huge_pages_node[nid]) {
> + if (PageHugeTemporary(page)) {
> + list_del(&page->lru);
> + ClearPageHugeTemporary(page);
> + update_and_free_page(h, page);
> + } else if (h->surplus_huge_pages_node[nid]) {
> /* remove the page from active list */
> list_del(&page->lru);
> update_and_free_page(h, page);
> @@ -1505,7 +1532,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
> return rc;
> }
>
> -static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
> +/*
> + * Allocates a fresh surplus page from the page allocator.
> + */
> +static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> int nid, nodemask_t *nmask)
> {
> struct page *page;
> @@ -1569,6 +1599,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
> return page;
> }
>
> +static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> + int nid, nodemask_t *nmask)
> +{
> + struct page *page;
> +
> + if (hstate_is_gigantic(h))
> + return NULL;
> +
> + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
> + if (!page)
> + return NULL;
> +
> + /*
> + * We do not account these pages as surplus because they are only
> + * temporary and will be released properly on the last reference
> + */
> + prep_new_huge_page(h, page, page_to_nid(page));
> + SetPageHugeTemporary(page);
> +
> + return page;
> +}
> +
> /*
> * Use the VMA's mpolicy to allocate a huge page from the buddy.
> */
> @@ -1583,17 +1635,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
> nodemask_t *nodemask;
>
> nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> - page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
> + page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> mpol_cond_put(mpol);
>
> return page;
> }
>
> -/*
> - * This allocation function is useful in the context where vma is irrelevant.
> - * E.g. soft-offlining uses this function because it only cares physical
> - * address of error page.
> - */
> +/* page migration callback function */
> struct page *alloc_huge_page_node(struct hstate *h, int nid)
> {
> gfp_t gfp_mask = htlb_alloc_mask(h);
> @@ -1608,12 +1656,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
> spin_unlock(&hugetlb_lock);
>
> if (!page)
> - page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
> + page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
>
> return page;
> }
>
> -
> +/* page migration callback function */
> struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> nodemask_t *nmask)
> {
> @@ -1631,9 +1679,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> }
> spin_unlock(&hugetlb_lock);
>
> - /* No reservations, try to overcommit */
> -
> - return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
> + return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
> }
>
> /*
> @@ -1661,7 +1707,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
> retry:
> spin_unlock(&hugetlb_lock);
> for (i = 0; i < needed; i++) {
> - page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
> + page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
> NUMA_NO_NODE, NULL);
> if (!page) {
> alloc_ok = false;
> @@ -2258,7 +2304,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> * First take pages out of surplus state. Then make up the
> * remaining difference by allocating fresh huge pages.
> *
> - * We might race with __alloc_buddy_huge_page() here and be unable
> + * We might race with __alloc_surplus_huge_page() here and be unable
> * to convert a surplus huge page to a normal huge page. That is
> * not critical, though, it just means the overall size of the
> * pool might be one hugepage larger than it needs to be, but
> @@ -2301,7 +2347,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> * By placing pages into the surplus state independent of the
> * overcommit value, we are allowing the surplus pool size to
> * exceed overcommit. There are few sane options here. Since
> - * __alloc_buddy_huge_page() is checking the global counter,
> + * __alloc_surplus_huge_page() is checking the global counter,
> * though, we'll note that we're not allowed to exceed surplus
> * and won't grow the pool anywhere else. Not until one of the
> * sysctls are changed, or the surplus pages go out of use.
> @@ -4775,3 +4821,36 @@ void putback_active_hugepage(struct page *page)
> spin_unlock(&hugetlb_lock);
> put_page(page);
> }
> +
> +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
> +{
> + struct hstate *h = page_hstate(oldpage);
> +
> + hugetlb_cgroup_migrate(oldpage, newpage);
> + set_page_owner_migrate_reason(newpage, reason);
> +
> + /*
> + * transfer temporary state of the new huge page. This is
> + * reverse to other transitions because the newpage is going to
> + * be final while the old one will be freed so it takes over
> + * the temporary status.
> + *
> + * Also note that we have to transfer the per-node surplus state
> + * here as well otherwise the global surplus count will not match
> + * the per-node's.
> + */
> + if (PageHugeTemporary(newpage)) {
> + int old_nid = page_to_nid(oldpage);
> + int new_nid = page_to_nid(newpage);
> +
> + SetPageHugeTemporary(oldpage);
> + ClearPageHugeTemporary(newpage);
> +
> + spin_lock(&hugetlb_lock);
> + if (h->surplus_huge_pages_node[old_nid]) {
> + h->surplus_huge_pages_node[old_nid]--;
> + h->surplus_huge_pages_node[new_nid]++;
> + }
> + spin_unlock(&hugetlb_lock);
> + }
> +}

In the previous version of this patch, I asked about handling of 'free' huge
pages. I did a little digging and IIUC, we do not attempt migration of
free huge pages. The routine isolate_huge_page() has this check:

if (!page_huge_active(page) || !get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
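
For context, the surrounding helper reads roughly like this in this era's
tree (abridged from mm/hugetlb.c; a paraphrase from memory rather than an
authoritative quote):

bool isolate_huge_page(struct page *page, struct list_head *list)
{
	bool ret = true;

	spin_lock(&hugetlb_lock);
	/* a free (pooled, not in use) huge page is not active */
	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
		ret = false;
		goto unlock;
	}
	clear_page_huge_active(page);
	list_move_tail(&page->lru, list);
unlock:
	spin_unlock(&hugetlb_lock);
	return ret;
}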

I believe one of your motivations for this effort was memory offlining.
So, this implies that a memory area can not be offlined if it contains
a free (not in use) huge page? Just FYI and may be something we want to
address later.

My other issues were addressed.

Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz

> diff --git a/mm/migrate.c b/mm/migrate.c
> index 4d0be47a322a..1e5525a25691 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1323,9 +1323,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
> put_anon_vma(anon_vma);
>
> if (rc == MIGRATEPAGE_SUCCESS) {
> - hugetlb_cgroup_migrate(hpage, new_hpage);
> + move_hugetlb_state(hpage, new_hpage, reason);
> put_new_page = NULL;
> - set_page_owner_migrate_reason(new_hpage, reason);
> }
>
> unlock_page(hpage);
>

2017-12-14 00:46:54

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 4/5] mm, hugetlb: get rid of surplus page accounting tricks

On 12/04/2017 06:01 AM, Michal Hocko wrote:
> From: Michal Hocko <[email protected]>
>
> alloc_surplus_huge_page increases the pool size and the number of
> surplus pages opportunistically to prevent races with the pool size
> change. See d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages
> sysctl") for more details.
>
> The resulting code is unnecessarily hairy, causes code duplication and
> doesn't allow sharing the allocation paths. Moreover, pool size changes
> tend to be rare, so optimizing for them is not really reasonable.
> Simplify the code and allow allocating a fresh surplus page as long as
> we are under the overcommit limit and then recheck the condition after
> the allocation and drop the new page if the situation has changed. This
> should provide a reasonable guarantee that abrupt allocation requests
> will not go way off the limit.
>
> If we consider races with the pool shrinking and enlarging then we
> should be reasonably safe as well. In the first case we are off by one
> in the worst case and the second case should work OK because the page is
> not yet visible. We can waste CPU cycles for the allocation but that
> should be acceptable for a relatively rare condition.
>
> Signed-off-by: Michal Hocko <[email protected]>
> ---
> mm/hugetlb.c | 60 +++++++++++++++++++++---------------------------------------
> 1 file changed, 21 insertions(+), 39 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a1b8b2888ec9..0c7dc269b6c0 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1538,62 +1538,44 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
> static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> int nid, nodemask_t *nmask)
> {
> - struct page *page;
> - unsigned int r_nid;
> + struct page *page = NULL;
>
> if (hstate_is_gigantic(h))
> return NULL;
>
> - /*
> - * Assume we will successfully allocate the surplus page to
> - * prevent racing processes from causing the surplus to exceed
> - * overcommit
> - *
> - * This however introduces a different race, where a process B
> - * tries to grow the static hugepage pool while alloc_pages() is
> - * called by process A. B will only examine the per-node
> - * counters in determining if surplus huge pages can be
> - * converted to normal huge pages in adjust_pool_surplus(). A
> - * won't be able to increment the per-node counter, until the
> - * lock is dropped by B, but B doesn't drop hugetlb_lock until
> - * no more huge pages can be converted from surplus to normal
> - * state (and doesn't try to convert again). Thus, we have a
> - * case where a surplus huge page exists, the pool is grown, and
> - * the surplus huge page still exists after, even though it
> - * should just have been converted to a normal huge page. This
> - * does not leak memory, though, as the hugepage will be freed
> - * once it is out of use. It also does not allow the counters to
> - * go out of whack in adjust_pool_surplus() as we don't modify
> - * the node values until we've gotten the hugepage and only the
> - * per-node value is checked there.
> - */
> spin_lock(&hugetlb_lock);
> - if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
> - spin_unlock(&hugetlb_lock);
> - return NULL;
> - } else {
> - h->nr_huge_pages++;
> - h->surplus_huge_pages++;
> - }
> + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
> + goto out_unlock;
> spin_unlock(&hugetlb_lock);
>
> page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
> + if (!page)
> + goto out_unlock;
>
> spin_lock(&hugetlb_lock);
> - if (page) {
> + /*
> + * We could have raced with the pool size change.
> + * Double check that and simply deallocate the new page
> + * if we would end up overcommitting the surpluses. Abuse
> + * temporary page to workaround the nasty free_huge_page
> + * codeflow
> + */
> + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
> + SetPageHugeTemporary(page);
> + put_page(page);
> + page = NULL;
> + } else {
> + h->surplus_huge_pages_node[page_to_nid(page)]++;
> + h->surplus_huge_pages++;
> INIT_LIST_HEAD(&page->lru);
> r_nid = page_to_nid(page);
> set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> set_hugetlb_cgroup(page, NULL);
> - /*
> - * We incremented the global counters already
> - */
> h->nr_huge_pages_node[r_nid]++;
> h->surplus_huge_pages_node[r_nid]++;
> - } else {
> - h->nr_huge_pages--;
> - h->surplus_huge_pages--;

In the case of a successful surplus allocation, the following counters
are incremented:

h->surplus_huge_pages_node[page_to_nid(page)]++;
h->surplus_huge_pages++;
h->nr_huge_pages_node[r_nid]++;
h->surplus_huge_pages_node[r_nid]++;

Looks like per-node surplus_huge_pages_node is incremented twice, and
global nr_huge_pages is not incremented at all.

Also, you removed r_nid so I'm guessing this will not compile?
--
Mike Kravetz


> }
> +
> +out_unlock:
> spin_unlock(&hugetlb_lock);
>
> return page;
>

2017-12-14 07:40:56

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm, hugetlb: do not rely on overcommit limit during migration

On Wed 13-12-17 15:35:33, Mike Kravetz wrote:
> On 12/04/2017 06:01 AM, Michal Hocko wrote:
[...]
> > Before migration
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
> >
> > After
> >
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0
> > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1
> > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
> >
> > with the previous implementation, both nodes would have nr_hugepages:1
> > until the page is freed.
>
> With the previous implementation, the migration would have failed unless
> nr_overcommit_hugepages was explicitly set. Correct?

yes

[...]

> In the previous version of this patch, I asked about handling of 'free' huge
> pages. I did a little digging and IIUC, we do not attempt migration of
> free huge pages. The routine isolate_huge_page() has this check:
>
> if (!page_huge_active(page) || !get_page_unless_zero(page)) {
> ret = false;
> goto unlock;
> }
>
> I believe one of your motivations for this effort was memory offlining.
> So, this implies that a memory area can not be offlined if it contains
> a free (not in use) huge page?

do_migrate_range will ignore this free huge page and then we will free
it up in dissolve_free_huge_pages
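
A condensed sketch of that flow (illustrative only; offline_huge_range()
below is a made-up wrapper, the real logic lives in the offlining path):

static int offline_huge_range(unsigned long start_pfn, unsigned long end_pfn)
{
	/*
	 * do_migrate_range() only isolates pages that pass
	 * isolate_huge_page(); a free huge page is not
	 * page_huge_active() and is simply left in place.
	 */
	do_migrate_range(start_pfn, end_pfn);

	/*
	 * Free huge pages in the range are torn down here and returned
	 * to the buddy allocator. This can fail if those pages are
	 * needed to satisfy an existing reservation.
	 */
	return dissolve_free_huge_pages(start_pfn, end_pfn);
}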

> Just FYI and may be something we want to address later.

Maybe yes. The free pool might be reserved, which would make
dissolve_free_huge_pages fail. Maybe we can be more clever and
allocate a new huge page in that case.

> My other issues were addressed.
>
> Reviewed-by: Mike Kravetz <[email protected]>

Thanks!
--
Michal Hocko
SUSE Labs

2017-12-14 07:50:14

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 4/5] mm, hugetlb: get rid of surplus page accounting tricks

On Wed 13-12-17 16:45:55, Mike Kravetz wrote:
> On 12/04/2017 06:01 AM, Michal Hocko wrote:
> > From: Michal Hocko <[email protected]>
> >
> > alloc_surplus_huge_page increases the pool size and the number of
> > surplus pages opportunistically to prevent races with the pool size
> > change. See d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages
> > sysctl") for more details.
> >
> > The resulting code is unnecessarily hairy, causes code duplication and
> > doesn't allow sharing the allocation paths. Moreover, pool size changes
> > tend to be rare, so optimizing for them is not really reasonable.
> > Simplify the code and allow allocating a fresh surplus page as long as
> > we are under the overcommit limit and then recheck the condition after
> > the allocation and drop the new page if the situation has changed. This
> > should provide a reasonable guarantee that abrupt allocation requests
> > will not go way off the limit.
> >
> > If we consider races with the pool shrinking and enlarging then we
> > should be reasonably safe as well. In the first case we are off by one
> > in the worst case and the second case should work OK because the page is
> > not yet visible. We can waste CPU cycles for the allocation but that
> > should be acceptable for a relatively rare condition.
> >
> > Signed-off-by: Michal Hocko <[email protected]>
> > ---
> > mm/hugetlb.c | 60 +++++++++++++++++++++---------------------------------------
> > 1 file changed, 21 insertions(+), 39 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index a1b8b2888ec9..0c7dc269b6c0 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1538,62 +1538,44 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
> > static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> > int nid, nodemask_t *nmask)
> > {
> > - struct page *page;
> > - unsigned int r_nid;
> > + struct page *page = NULL;
> >
> > if (hstate_is_gigantic(h))
> > return NULL;
> >
> > - /*
> > - * Assume we will successfully allocate the surplus page to
> > - * prevent racing processes from causing the surplus to exceed
> > - * overcommit
> > - *
> > - * This however introduces a different race, where a process B
> > - * tries to grow the static hugepage pool while alloc_pages() is
> > - * called by process A. B will only examine the per-node
> > - * counters in determining if surplus huge pages can be
> > - * converted to normal huge pages in adjust_pool_surplus(). A
> > - * won't be able to increment the per-node counter, until the
> > - * lock is dropped by B, but B doesn't drop hugetlb_lock until
> > - * no more huge pages can be converted from surplus to normal
> > - * state (and doesn't try to convert again). Thus, we have a
> > - * case where a surplus huge page exists, the pool is grown, and
> > - * the surplus huge page still exists after, even though it
> > - * should just have been converted to a normal huge page. This
> > - * does not leak memory, though, as the hugepage will be freed
> > - * once it is out of use. It also does not allow the counters to
> > - * go out of whack in adjust_pool_surplus() as we don't modify
> > - * the node values until we've gotten the hugepage and only the
> > - * per-node value is checked there.
> > - */
> > spin_lock(&hugetlb_lock);
> > - if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
> > - spin_unlock(&hugetlb_lock);
> > - return NULL;
> > - } else {
> > - h->nr_huge_pages++;
> > - h->surplus_huge_pages++;
> > - }
> > + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
> > + goto out_unlock;
> > spin_unlock(&hugetlb_lock);
> >
> > page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
> > + if (!page)
> > + goto out_unlock;
> >
> > spin_lock(&hugetlb_lock);
> > - if (page) {
> > + /*
> > + * We could have raced with the pool size change.
> > + * Double check that and simply deallocate the new page
> > + * if we would end up overcommitting the surpluses. Abuse
> > + * temporary page to workaround the nasty free_huge_page
> > + * codeflow
> > + */
> > + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
> > + SetPageHugeTemporary(page);
> > + put_page(page);
> > + page = NULL;
> > + } else {
> > + h->surplus_huge_pages_node[page_to_nid(page)]++;
> > + h->surplus_huge_pages++;
> > INIT_LIST_HEAD(&page->lru);
> > r_nid = page_to_nid(page);
> > set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> > set_hugetlb_cgroup(page, NULL);
> > - /*
> > - * We incremented the global counters already
> > - */
> > h->nr_huge_pages_node[r_nid]++;
> > h->surplus_huge_pages_node[r_nid]++;
> > - } else {
> > - h->nr_huge_pages--;
> > - h->surplus_huge_pages--;
>
> In the case of a successful surplus allocation, the following counters
> are incremented:
>
> h->surplus_huge_pages_node[page_to_nid(page)]++;
> h->surplus_huge_pages++;
> h->nr_huge_pages_node[r_nid]++;
> h->surplus_huge_pages_node[r_nid]++;
>
> Looks like per-node surplus_huge_pages_node is incremented twice, and
> global nr_huge_pages is not incremented at all.
>
> Also, you removed r_nid so I'm guessing this will not compile?

Oops, a hiccup during the rebase/split-up. The following patch removes all
of this code, so I hadn't noticed. Thanks for catching that!
The incremental diff:
---
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41d2d9082f0d..3c16cde72ceb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1565,8 +1565,10 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
put_page(page);
page = NULL;
} else {
- h->surplus_huge_pages_node[page_to_nid(page)]++;
+ int r_nid;
+
h->surplus_huge_pages++;
+ h->nr_huge_pages++;
INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
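
Folding this fix into the patch, the success branch of
__alloc_surplus_huge_page() would presumably end up as (sketch; context
reconstructed from the hunks above):

	} else {
		int r_nid;

		h->surplus_huge_pages++;
		h->nr_huge_pages++;
		INIT_LIST_HEAD(&page->lru);
		r_nid = page_to_nid(page);
		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
		set_hugetlb_cgroup(page, NULL);
		h->nr_huge_pages_node[r_nid]++;
		h->surplus_huge_pages_node[r_nid]++;
	}

i.e. each of the global and per-node nr_huge_pages/surplus_huge_pages
counters is now bumped exactly once.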
--
Michal Hocko
SUSE Labs

2017-12-14 20:58:09

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm, hugetlb: do not rely on overcommit limit during migration

On 12/13/2017 11:40 PM, Michal Hocko wrote:
> On Wed 13-12-17 15:35:33, Mike Kravetz wrote:
>> On 12/04/2017 06:01 AM, Michal Hocko wrote:
> [...]
>>> Before migration
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
>>>
>>> After
>>>
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0
>>> /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1
>>> /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0
>>>
>>> with the previous implementation, both nodes would have nr_hugepages:1
>>> until the page is freed.
>>
>> With the previous implementation, the migration would have failed unless
>> nr_overcommit_hugepages was explicitly set. Correct?
>
> yes
>
> [...]
>
>> In the previous version of this patch, I asked about handling of 'free' huge
>> pages. I did a little digging and IIUC, we do not attempt migration of
>> free huge pages. The routine isolate_huge_page() has this check:
>>
>> if (!page_huge_active(page) || !get_page_unless_zero(page)) {
>> ret = false;
>> goto unlock;
>> }
>>
>> I believe one of your motivations for this effort was memory offlining.
>> So, this implies that a memory area can not be offlined if it contains
>> a free (not in use) huge page?
>
> do_migrate_range will ignore this free huge page and then we will free
> it up in dissolve_free_huge_pages
>
>> Just FYI and may be something we want to address later.
>
> Maybe yes. The free pool might be reserved, which would make
> dissolve_free_huge_pages fail. Maybe we can be more clever and
> allocate a new huge page in that case.

I don't think we need to try to do anything more clever right now. I was
just a little confused about the hotplug code. Thanks for the explanation.

--
Mike Kravetz

>
>> My other issues were addressed.
>>
>> Reviewed-by: Mike Kravetz <[email protected]>
>
> Thanks!
>

2017-12-14 20:59:10

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 4/5] mm, hugetlb: get rid of surplus page accounting tricks

On 12/13/2017 11:50 PM, Michal Hocko wrote:
> On Wed 13-12-17 16:45:55, Mike Kravetz wrote:
>> On 12/04/2017 06:01 AM, Michal Hocko wrote:
>>> From: Michal Hocko <[email protected]>
>>>
>>> alloc_surplus_huge_page increases the pool size and the number of
>>> surplus pages opportunistically to prevent races with the pool size
>>> change. See d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages
>>> sysctl") for more details.
>>>
>>> The resulting code is unnecessarily hairy, causes code duplication and
>>> doesn't allow sharing the allocation paths. Moreover, pool size changes
>>> tend to be rare, so optimizing for them is not really reasonable.
>>> Simplify the code and allow allocating a fresh surplus page as long as
>>> we are under the overcommit limit and then recheck the condition after
>>> the allocation and drop the new page if the situation has changed. This
>>> should provide a reasonable guarantee that abrupt allocation requests
>>> will not go way off the limit.
>>>
>>> If we consider races with the pool shrinking and enlarging then we
>>> should be reasonably safe as well. In the first case we are off by one
>>> in the worst case and the second case should work OK because the page is
>>> not yet visible. We can waste CPU cycles for the allocation but that
>>> should be acceptable for a relatively rare condition.
>>>
>>> Signed-off-by: Michal Hocko <[email protected]>
>>> ---
>>> mm/hugetlb.c | 60 +++++++++++++++++++++---------------------------------------
>>> 1 file changed, 21 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index a1b8b2888ec9..0c7dc269b6c0 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -1538,62 +1538,44 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
>>> static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
>>> int nid, nodemask_t *nmask)
>>> {
>>> - struct page *page;
>>> - unsigned int r_nid;
>>> + struct page *page = NULL;
>>>
>>> if (hstate_is_gigantic(h))
>>> return NULL;
>>>
>>> - /*
>>> - * Assume we will successfully allocate the surplus page to
>>> - * prevent racing processes from causing the surplus to exceed
>>> - * overcommit
>>> - *
>>> - * This however introduces a different race, where a process B
>>> - * tries to grow the static hugepage pool while alloc_pages() is
>>> - * called by process A. B will only examine the per-node
>>> - * counters in determining if surplus huge pages can be
>>> - * converted to normal huge pages in adjust_pool_surplus(). A
>>> - * won't be able to increment the per-node counter, until the
>>> - * lock is dropped by B, but B doesn't drop hugetlb_lock until
>>> - * no more huge pages can be converted from surplus to normal
>>> - * state (and doesn't try to convert again). Thus, we have a
>>> - * case where a surplus huge page exists, the pool is grown, and
>>> - * the surplus huge page still exists after, even though it
>>> - * should just have been converted to a normal huge page. This
>>> - * does not leak memory, though, as the hugepage will be freed
>>> - * once it is out of use. It also does not allow the counters to
>>> - * go out of whack in adjust_pool_surplus() as we don't modify
>>> - * the node values until we've gotten the hugepage and only the
>>> - * per-node value is checked there.
>>> - */
>>> spin_lock(&hugetlb_lock);
>>> - if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
>>> - spin_unlock(&hugetlb_lock);
>>> - return NULL;
>>> - } else {
>>> - h->nr_huge_pages++;
>>> - h->surplus_huge_pages++;
>>> - }
>>> + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
>>> + goto out_unlock;
>>> spin_unlock(&hugetlb_lock);
>>>
>>> page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
>>> + if (!page)
>>> + goto out_unlock;
>>>
>>> spin_lock(&hugetlb_lock);
>>> - if (page) {
>>> + /*
>>> + * We could have raced with the pool size change.
>>> + * Double check that and simply deallocate the new page
>>> + * if we would end up overcommitting the surpluses. Abuse
>>> + * temporary page to workaround the nasty free_huge_page
>>> + * codeflow
>>> + */
>>> + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
>>> + SetPageHugeTemporary(page);
>>> + put_page(page);
>>> + page = NULL;
>>> + } else {
>>> + h->surplus_huge_pages_node[page_to_nid(page)]++;
>>> + h->surplus_huge_pages++;
>>> INIT_LIST_HEAD(&page->lru);
>>> r_nid = page_to_nid(page);
>>> set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>>> set_hugetlb_cgroup(page, NULL);
>>> - /*
>>> - * We incremented the global counters already
>>> - */
>>> h->nr_huge_pages_node[r_nid]++;
>>> h->surplus_huge_pages_node[r_nid]++;
>>> - } else {
>>> - h->nr_huge_pages--;
>>> - h->surplus_huge_pages--;
>>
>> In the case of a successful surplus allocation, the following counters
>> are incremented:
>>
>> h->surplus_huge_pages_node[page_to_nid(page)]++;
>> h->surplus_huge_pages++;
>> h->nr_huge_pages_node[r_nid]++;
>> h->surplus_huge_pages_node[r_nid]++;
>>
>> Looks like per-node surplus_huge_pages_node is incremented twice, and
>> global nr_huge_pages is not incremented at all.
>>
>> Also, you removed r_nid so I'm guessing this will not compile?
>
> Oops, a hiccup during the rebase/split-up. The following patch removes all
> of this code, so I hadn't noticed. Thanks for catching that!
> The incremental diff:
> ---
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 41d2d9082f0d..3c16cde72ceb 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1565,8 +1565,10 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> put_page(page);
> page = NULL;
> } else {
> - h->surplus_huge_pages_node[page_to_nid(page)]++;
> + int r_nid;
> +
> h->surplus_huge_pages++;
> + h->nr_huge_pages++;
> INIT_LIST_HEAD(&page->lru);
> r_nid = page_to_nid(page);
> set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
>

With the incremental diff,

Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz

2017-12-14 21:03:03

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 5/5] mm, hugetlb: further simplify hugetlb allocation API

On 12/04/2017 06:01 AM, Michal Hocko wrote:
> From: Michal Hocko <[email protected]>
>
> The hugetlb allocator has several layers of allocation functions depending
> on the purpose of the allocation. There are two allocators depending
> on whether the page can be allocated from the page allocator or we need
> a contiguous allocator. This is currently opencoded in alloc_fresh_huge_page,
> which is the only path that might allocate giga pages, which require the
> latter allocator. Create alloc_fresh_huge_page which hides this
> implementation detail and use it in all callers which hardcoded the
> buddy allocator path (__hugetlb_alloc_buddy_huge_page). This shouldn't
> introduce any functional change because both migration and surplus
> allocators exclude giga pages explicitly.
>
> While we are at it, let's do some renaming. The current scheme is not
> consistent and overly painful to read and understand. Get rid of prefix
> underscores from most functions. There is no real reason to make names
> longer.
> * alloc_fresh_huge_page is the new layer to abstract underlying
> allocator
> * __hugetlb_alloc_buddy_huge_page becomes shorter and neater
> alloc_buddy_huge_page.
> * Former alloc_fresh_huge_page becomes alloc_pool_huge_page because we put
> the new page directly to the pool
> * alloc_surplus_huge_page can drop the opencoded prep_new_huge_page code
> as it uses alloc_fresh_huge_page now
> * others lose their excessive prefix underscores to make names shorter
>

This patch will need to be modified to take into account the incremental
diff to patch 4 in this series. Other than that, the changes look good.

Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz

> Signed-off-by: Michal Hocko <[email protected]>
> ---
> mm/hugetlb.c | 74 +++++++++++++++++++++++++++++++++---------------------------
> 1 file changed, 41 insertions(+), 33 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0c7dc269b6c0..73c74851a304 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1376,7 +1376,7 @@ pgoff_t __basepage_index(struct page *page)
> return (index << compound_order(page_head)) + compound_idx;
> }
>
> -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
> +static struct page *alloc_buddy_huge_page(struct hstate *h,
> gfp_t gfp_mask, int nid, nodemask_t *nmask)
> {
> int order = huge_page_order(h);
> @@ -1394,34 +1394,49 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
> return page;
> }
>
> +/*
> + * Common helper to allocate a fresh hugetlb page. All specific allocators
> + * should use this function to get new hugetlb pages
> + */
> +static struct page *alloc_fresh_huge_page(struct hstate *h,
> + gfp_t gfp_mask, int nid, nodemask_t *nmask)
> +{
> + struct page *page;
> +
> + if (hstate_is_gigantic(h))
> + page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
> + else
> + page = alloc_buddy_huge_page(h, gfp_mask,
> + nid, nmask);
> + if (!page)
> + return NULL;
> +
> + if (hstate_is_gigantic(h))
> + prep_compound_gigantic_page(page, huge_page_order(h));
> + prep_new_huge_page(h, page, page_to_nid(page));
> +
> + return page;
> +}
> +
> /*
> * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
> * manner.
> */
> -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> {
> struct page *page;
> int nr_nodes, node;
> gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
>
> for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> - if (hstate_is_gigantic(h))
> - page = alloc_gigantic_page(h, gfp_mask,
> - node, nodes_allowed);
> - else
> - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask,
> - node, nodes_allowed);
> + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
> if (page)
> break;
> -
> }
>
> if (!page)
> return 0;
>
> - if (hstate_is_gigantic(h))
> - prep_compound_gigantic_page(page, huge_page_order(h));
> - prep_new_huge_page(h, page, page_to_nid(page));
> put_page(page); /* free it into the hugepage allocator */
>
> return 1;
> @@ -1535,7 +1550,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
> /*
> * Allocates a fresh surplus page from the page allocator.
> */
> -static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> +static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> int nid, nodemask_t *nmask)
> {
> struct page *page = NULL;
> @@ -1548,7 +1563,7 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> goto out_unlock;
> spin_unlock(&hugetlb_lock);
>
> - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
> + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
> if (!page)
> goto out_unlock;
>
> @@ -1567,12 +1582,6 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> } else {
> h->surplus_huge_pages_node[page_to_nid(page)]++;
> h->surplus_huge_pages++;
> - INIT_LIST_HEAD(&page->lru);
> - r_nid = page_to_nid(page);
> - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> - set_hugetlb_cgroup(page, NULL);
> - h->nr_huge_pages_node[r_nid]++;
> - h->surplus_huge_pages_node[r_nid]++;
> }
>
> out_unlock:
> @@ -1581,7 +1590,7 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
> return page;
> }
>
> -static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> int nid, nodemask_t *nmask)
> {
> struct page *page;
> @@ -1589,7 +1598,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> if (hstate_is_gigantic(h))
> return NULL;
>
> - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
> + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
> if (!page)
> return NULL;
>
> @@ -1597,7 +1606,6 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> * We do not account these pages as surplus because they are only
> * temporary and will be released properly on the last reference
> */
> - prep_new_huge_page(h, page, page_to_nid(page));
> SetPageHugeTemporary(page);
>
> return page;
> @@ -1607,7 +1615,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> * Use the VMA's mpolicy to allocate a huge page from the buddy.
> */
> static
> -struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
> +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
> struct vm_area_struct *vma, unsigned long addr)
> {
> struct page *page;
> @@ -1617,7 +1625,7 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
> nodemask_t *nodemask;
>
> nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
> - page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
> mpol_cond_put(mpol);
>
> return page;
> @@ -1638,7 +1646,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
> spin_unlock(&hugetlb_lock);
>
> if (!page)
> - page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
> + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
>
> return page;
> }
> @@ -1661,7 +1669,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> }
> spin_unlock(&hugetlb_lock);
>
> - return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
> + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
> }
>
> /*
> @@ -1689,7 +1697,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
> retry:
> spin_unlock(&hugetlb_lock);
> for (i = 0; i < needed; i++) {
> - page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
> + page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
> NUMA_NO_NODE, NULL);
> if (!page) {
> alloc_ok = false;
> @@ -2026,7 +2034,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
> if (!page) {
> spin_unlock(&hugetlb_lock);
> - page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
> + page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
> if (!page)
> goto out_uncharge_cgroup;
> if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
> @@ -2166,7 +2174,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
> if (hstate_is_gigantic(h)) {
> if (!alloc_bootmem_huge_page(h))
> break;
> - } else if (!alloc_fresh_huge_page(h,
> + } else if (!alloc_pool_huge_page(h,
> &node_states[N_MEMORY]))
> break;
> cond_resched();
> @@ -2286,7 +2294,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> * First take pages out of surplus state. Then make up the
> * remaining difference by allocating fresh huge pages.
> *
> - * We might race with __alloc_surplus_huge_page() here and be unable
> + * We might race with alloc_surplus_huge_page() here and be unable
> * to convert a surplus huge page to a normal huge page. That is
> * not critical, though, it just means the overall size of the
> * pool might be one hugepage larger than it needs to be, but
> @@ -2309,7 +2317,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> /* yield cpu to avoid soft lockup */
> cond_resched();
>
> - ret = alloc_fresh_huge_page(h, nodes_allowed);
> + ret = alloc_pool_huge_page(h, nodes_allowed);
> spin_lock(&hugetlb_lock);
> if (!ret)
> goto out;
> @@ -2329,7 +2337,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> * By placing pages into the surplus state independent of the
> * overcommit value, we are allowing the surplus pool size to
> * exceed overcommit. There are few sane options here. Since
> - * __alloc_surplus_huge_page() is checking the global counter,
> + * alloc_surplus_huge_page() is checking the global counter,
> * though, we'll note that we're not allowed to exceed surplus
> * and won't grow the pool anywhere else. Not until one of the
> * sysctls are changed, or the surplus pages go out of use.
>

2017-12-15 09:33:33

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

Naoya,
this has passed Mike's review (thanks for that!), you have mentioned
that you can pass this through your testing machinery earlier. While
I've done some testing already I would really appreciate if you could
do that as well. Review would be highly appreciated as well.

Thanks!

On Mon 04-12-17 15:01:12, Michal Hocko wrote:
> Hi,
> this is a follow up for [1] for the allocation API and [2] for the
> hugetlb migration. It wasn't really easy to split those into two
> separate patch series as they share some code.
>
> My primary motivation to touch this code is to make the gigantic pages
> migration working. The giga pages allocation code is just too fragile
> and hacked into the hugetlb code now. This series tries to move giga
> pages closer to the first class citizen. We are not there yet but having
> 5 patches is quite a lot already and it will already make the code much
> easier to follow. I will come with other changes on top after this sees
> some review.
>
> The first two patches should be trivial to review. The third patch
> changes the way how we migrate huge pages. Newly allocated pages are a
> subject of the overcommit check and they participate surplus accounting
> which is quite unfortunate as the changelog explains. This patch doesn't
> change anything wrt. giga pages.
> Patch #4 removes the surplus accounting hack from
> __alloc_surplus_huge_page. I hope I didn't miss anything there and a
> deeper review is really due there.
> Patch #5 finally unifies allocation paths and giga pages shouldn't be
> any special anymore. There is also some renaming going on as well.
>
> Shortlog
> Michal Hocko (5):
> mm, hugetlb: unify core page allocation accounting and initialization
> mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
> mm, hugetlb: do not rely on overcommit limit during migration
> mm, hugetlb: get rid of surplus page accounting tricks
> mm, hugetlb: further simplify hugetlb allocation API
>
> Diffstat:
> include/linux/hugetlb.h | 3 +
> mm/hugetlb.c | 305 +++++++++++++++++++++++++++---------------------
> mm/migrate.c | 3 +-
> 3 files changed, 175 insertions(+), 136 deletions(-)
>
>
> [1] http://lkml.kernel.org/r/[email protected]
> [2] http://lkml.kernel.org/r/[email protected]
>

--
Michal Hocko
SUSE Labs

2017-12-20 05:35:25

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements


On 12/15/2017 06:33 PM, Michal Hocko wrote:
> Naoya,
> this has passed Mike's review (thanks for that!), you have mentioned
> that you can pass this through your testing machinery earlier. While
> I've done some testing already I would really appreciate if you could
> do that as well. Review would be highly appreciated as well.

Sorry for my slow response. I reviewed/tested this patchset and looks
good to me overall.

I have one comment on the code path from mbind(2).
The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.

I don't think this is a bug, but it would be better if mbind(2) works
more similarly with other migration callers like move_pages(2)/migrate_pages(2).
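
For reference, a rough sketch of the call chain in question (simplified
from the patches in this series; illustrative, not a literal trace):

/*
 * hugetlb migration triggered by mbind(2), as of this series:
 *
 *   do_mbind()
 *     migrate_pages(&pagelist, new_page, ...)
 *       new_page()                         [mm/mempolicy.c]
 *         alloc_huge_page_noerr()
 *           alloc_huge_page()
 *             alloc_buddy_huge_page_with_mpol()
 *               alloc_surplus_huge_page()  <- subject to
 *                                             nr_overcommit_huge_pages and
 *                                             never sets PageHugeTemporary()
 */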

Thanks,
Naoya Horiguchi


>
> Thanks!
>
> On Mon 04-12-17 15:01:12, Michal Hocko wrote:
>> Hi,
>> this is a follow up for [1] for the allocation API and [2] for the
>> hugetlb migration. It wasn't really easy to split those into two
>> separate patch series as they share some code.
>>
>> My primary motivation to touch this code is to make the gigantic pages
>> migration working. The giga pages allocation code is just too fragile
>> and hacked into the hugetlb code now. This series tries to move giga
>> pages closer to the first class citizen. We are not there yet but having
>> 5 patches is quite a lot already and it will already make the code much
>> easier to follow. I will come with other changes on top after this sees
>> some review.
>>
>> The first two patches should be trivial to review. The third patch
>> changes the way how we migrate huge pages. Newly allocated pages are a
>> subject of the overcommit check and they participate surplus accounting
>> which is quite unfortunate as the changelog explains. This patch doesn't
>> change anything wrt. giga pages.
>> Patch #4 removes the surplus accounting hack from
>> __alloc_surplus_huge_page. I hope I didn't miss anything there and a
>> deeper review is really due there.
>> Patch #5 finally unifies allocation paths and giga pages shouldn't be
>> any special anymore. There is also some renaming going on as well.
>>
>> Shortlog
>> Michal Hocko (5):
>> mm, hugetlb: unify core page allocation accounting and initialization
>> mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
>> mm, hugetlb: do not rely on overcommit limit during migration
>> mm, hugetlb: get rid of surplus page accounting tricks
>> mm, hugetlb: further simplify hugetlb allocation API
>>
>> Diffstat:
>> include/linux/hugetlb.h | 3 +
>> mm/hugetlb.c | 305 +++++++++++++++++++++++++++---------------------
>> mm/migrate.c | 3 +-
>> 3 files changed, 175 insertions(+), 136 deletions(-)
>>
>>
>> [1] http://lkml.kernel.org/r/[email protected]
>> [2] http://lkml.kernel.org/r/[email protected]
>>
>

2017-12-20 09:53:34

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On Wed 20-12-17 05:33:36, Naoya Horiguchi wrote:
>
> On 12/15/2017 06:33 PM, Michal Hocko wrote:
> > Naoya,
> > this has passed Mike's review (thanks for that!), you have mentioned
> > that you can pass this through your testing machinery earlier. While
> > I've done some testing already I would really appreciate if you could
> > do that as well. Review would be highly appreciated as well.
>
> Sorry for my slow response. I reviewed/tested this patchset and looks
> good to me overall.

No need to feel sorry. This doesn't have an urgent priority. Thanks for
the review and testing. Can I assume your {Reviewed,Acked}-by or
Tested-by?

> I have one comment on the code path from mbind(2).
> The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
> calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
> so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.

Yes, I am aware of that. I should have been more explicit in the
changelog. Sorry about that and thanks for pointing it out explicitly.
To be honest I wasn't really sure what to do about this. The code path
is really complex and it made my head spin. I fail to see why we have to
call alloc_huge_page and mess with reservations at all.

> I don't think this is a bug, but it would be better if mbind(2) works
> more similarly with other migration callers like move_pages(2)/migrate_pages(2).

If the fix is as easy as the following I will add it to the pile.
Otherwise I would prefer to do this separately after I find some more
time to understand the callpath.
---
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e035002d3fb6..08a4af411e25 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -345,10 +345,9 @@ struct huge_bootmem_page {
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask);
+struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);

@@ -526,7 +525,7 @@ struct hstate {};
#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
-#define alloc_huge_page_noerr(v, a, r) NULL
+#define alloc_huge_page_vma(vma, address) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_sizelog(s) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4426c5b23a20..e00deabe6d17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1672,6 +1672,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}

+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct page *page;
+ struct hstate *h;
+ gfp_t gfp_mask;
+ int node;
+
+ h = hstate_vma(vma);
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
+}
+
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -2077,20 +2096,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
return ERR_PTR(-ENOSPC);
}

-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
-{
- struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
- if (IS_ERR(page))
- page = NULL;
- return page;
-}
-
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f604b22ebb65..96823fa07f38 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
}

if (PageHuge(page)) {
- BUG_ON(!vma);
- return alloc_huge_page_noerr(vma, address, 1);
+ return alloc_huge_page_vma(vma, address);
} else if (thp_migration_supported() && PageTransHuge(page)) {
struct page *thp;

--
Michal Hocko
SUSE Labs

2017-12-20 22:43:15

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On 12/20/2017 01:53 AM, Michal Hocko wrote:
> On Wed 20-12-17 05:33:36, Naoya Horiguchi wrote:
>> I have one comment on the code path from mbind(2).
>> The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
>> calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
>> so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.
>
> Yes, I am aware of that. I should have been more explicit in the
> changelog. Sorry about that and thanks for pointing it out explicitly.
> To be honest I wasn't really sure what to do about this. The code path
> is really complex and it made my head spin. I fail to see why we have to
> call alloc_huge_page and mess with reservations at all.

Oops! I missed that in my review.

Since alloc_huge_page was called with avoid_reserve == 1, it should not
do anything with reserve counts. One potential issue with the existing
code is cgroup accounting done by alloc_huge_page. When the new target
page is allocated, it is charged against the cgroup even though the original
page is still accounted for. If we are 'at the cgroup limit', the migration
may fail because of this.
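
A condensed illustration of the charge in question (paraphrased from the
beginning of alloc_huge_page(); error handling trimmed):

	idx = hstate_index(h);
	/*
	 * The migration target is charged here even though the source page
	 * still holds its own charge; at the cgroup limit this fails and
	 * the caller sees ERR_PTR(-ENOSPC), so the migration is aborted.
	 */
	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
	if (ret)
		goto out_subpool_put;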

I like your new code below as it explicitly takes reserve and cgroup
accounting out of the picture for migration. Let me think about it
for another day before providing a Reviewed-by.

--
Mike Kravetz

>> I don't think this is a bug, but it would be better if mbind(2) works
>> more similarly with other migration callers like move_pages(2)/migrate_pages(2).
>
> If the fix is as easy as the following I will add it to the pile.
> Otherwise I would prefer to do this separately after I find some more
> time to understand the callpath.
> ---
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index e035002d3fb6..08a4af411e25 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -345,10 +345,9 @@ struct huge_bootmem_page {
> struct page *alloc_huge_page(struct vm_area_struct *vma,
> unsigned long addr, int avoid_reserve);
> struct page *alloc_huge_page_node(struct hstate *h, int nid);
> -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
> - unsigned long addr, int avoid_reserve);
> struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> nodemask_t *nmask);
> +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address);
> int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
> pgoff_t idx);
>
> @@ -526,7 +525,7 @@ struct hstate {};
> #define alloc_huge_page(v, a, r) NULL
> #define alloc_huge_page_node(h, nid) NULL
> #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
> -#define alloc_huge_page_noerr(v, a, r) NULL
> +#define alloc_huge_page_vma(vma, address) NULL
> #define alloc_bootmem_huge_page(h) NULL
> #define hstate_file(f) NULL
> #define hstate_sizelog(s) NULL
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4426c5b23a20..e00deabe6d17 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1672,6 +1672,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
> }
>
> +/* mempolicy aware migration callback */
> +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address)
> +{
> + struct mempolicy *mpol;
> + nodemask_t *nodemask;
> + struct page *page;
> + struct hstate *h;
> + gfp_t gfp_mask;
> + int node;
> +
> + h = hstate_vma(vma);
> + gfp_mask = htlb_alloc_mask(h);
> + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> + page = alloc_huge_page_nodemask(h, node, nodemask);
> + mpol_cond_put(mpol);
> +
> + return page;
> +}
> +
> /*
> * Increase the hugetlb pool such that it can accommodate a reservation
> * of size 'delta'.
> @@ -2077,20 +2096,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> return ERR_PTR(-ENOSPC);
> }
>
> -/*
> - * alloc_huge_page()'s wrapper which simply returns the page if allocation
> - * succeeds, otherwise NULL. This function is called from new_vma_page(),
> - * where no ERR_VALUE is expected to be returned.
> - */
> -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
> - unsigned long addr, int avoid_reserve)
> -{
> - struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
> - if (IS_ERR(page))
> - page = NULL;
> - return page;
> -}
> -
> int alloc_bootmem_huge_page(struct hstate *h)
> __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> int __alloc_bootmem_huge_page(struct hstate *h)
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index f604b22ebb65..96823fa07f38 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
> }
>
> if (PageHuge(page)) {
> - BUG_ON(!vma);
> - return alloc_huge_page_noerr(vma, address, 1);
> + return alloc_huge_page_vma(vma, address);
> } else if (thp_migration_supported() && PageTransHuge(page)) {
> struct page *thp;
>
>

2017-12-21 07:28:09

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On Wed 20-12-17 14:43:03, Mike Kravetz wrote:
> On 12/20/2017 01:53 AM, Michal Hocko wrote:
> > On Wed 20-12-17 05:33:36, Naoya Horiguchi wrote:
> >> I have one comment on the code path from mbind(2).
> >> The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
> >> calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
> >> so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.
> >
> > Yes, I am aware of that. I should have been more explicit in the
> > changelog. Sorry about that and thanks for pointing it out explicitly.
> > To be honest I wasn't really sure what to do about this. The code path
> > is really complex and it made my head spin. I fail to see why we have to
> > call alloc_huge_page and mess with reservations at all.
>
> Oops! I missed that in my review.
>
> Since alloc_huge_page was called with avoid_reserve == 1, it should not
> do anything with reserve counts. One potential issue with the existing
> code is cgroup accounting done by alloc_huge_page. When the new target
> page is allocated, it is charged against the cgroup even though the original
> page is still accounted for. If we are 'at the cgroup limit', the migration
> may fail because of this.

Yeah, the existing code seems just broken. I strongly suspect that the
allocation API for hugetlb was so complicated that this was just a
natural result of a confusion with some follow up changes on top.

> I like your new code below as it explicitly takes reserve and cgroup
> accounting out of the picture for migration. Let me think about it
> for another day before providing a Reviewed-by.

Thanks a lot!
--
Michal Hocko
SUSE Labs

2017-12-21 23:35:38

by Mike Kravetz

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On 12/20/2017 11:28 PM, Michal Hocko wrote:
> On Wed 20-12-17 14:43:03, Mike Kravetz wrote:
>> On 12/20/2017 01:53 AM, Michal Hocko wrote:
>>> On Wed 20-12-17 05:33:36, Naoya Horiguchi wrote:
>>>> I have one comment on the code path from mbind(2).
>>>> The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
>>>> calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
>>>> so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.
>>>
>>> Yes, I am aware of that. I should have been more explicit in the
>>> changelog. Sorry about that and thanks for pointing it out explicitly.
>>> To be honest I wasn't really sure what to do about this. The code path
>>> is really complex and it made my head spin. I fail to see why we have to
>>> call alloc_huge_page and mess with reservations at all.
>>
>> Oops! I missed that in my review.
>>
>> Since alloc_huge_page was called with avoid_reserve == 1, it should not
>> do anything with reserve counts. One potential issue with the existing
>> code is cgroup accounting done by alloc_huge_page. When the new target
>> page is allocated, it is charged against the cgroup even though the original
>> page is still accounted for. If we are 'at the cgroup limit', the migration
>> may fail because of this.
>
> Yeah, the existing code seems just broken. I strongly suspect that the
> allocation API for hugetlb was so complicated that this was just a
> natural result of a confusion with some follow up changes on top.
>
>> I like your new code below as it explicitly takes reserve and cgroup
>> accounting out of the picture for migration. Let me think about it
>> for another day before providing a Reviewed-by.
>
> Thanks a lot!

You can add,

Reviewed-by: Mike Kravetz <[email protected]>

I had some concerns about transferring huge page state during migration
not specific to this patch, so I did a bunch of testing.

--
Mike Kravetz

2017-12-22 09:00:09

by Naoya Horiguchi

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On 12/20/2017 06:53 PM, Michal Hocko wrote:
> On Wed 20-12-17 05:33:36, Naoya Horiguchi wrote:
>>
>> On 12/15/2017 06:33 PM, Michal Hocko wrote:
>>> Naoya,
>>> this has passed Mike's review (thanks for that!), you have mentioned
>>> that you can pass this through your testing machinery earlier. While
>>> I've done some testing already I would really appreciate if you could
>>> do that as well. Review would be highly appreciated as well.
>>
>> Sorry for my slow response. I reviewed/tested this patchset and looks
>> good to me overall.
>
> No need to feel sorry. This doesn't have an urgent priority. Thanks for
> the review and testing. Can I assume your {Reviewed,Acked}-by or
> Tested-by?
>

Yes, I tested again with additional changes below, and hugetlb migration
works fine from mbind(2). Thank you very much for your work.

Reviewed-by: Naoya Horiguchi <[email protected]>

for the series.

Thanks,
Naoya Horiguchi

>> I have one comment on the code path from mbind(2).
>> The callback passed to migrate_pages() in do_mbind() (i.e. new_page())
>> calls alloc_huge_page_noerr() which currently doesn't call SetPageHugeTemporary(),
>> so hugetlb migration fails when h->surplus_huge_page >= h->nr_overcommit_huge_pages.
>
> Yes, I am aware of that. I should have been more explicit in the
> changelog. Sorry about that and thanks for pointing it out explicitly.
> To be honest I wasn't really sure what to do about this. The code path
> is really complex and it made my head spin. I fail to see why we have to
> call alloc_huge_page and mess with reservations at all.
>
>> I don't think this is a bug, but it would be better if mbind(2) works
>> more similarly with other migration callers like move_pages(2)/migrate_pages(2).
>
> If the fix is as easy as the following, I will add it to the pile.
> Otherwise I would prefer to do this separately, after I find some more
> time to understand the call path.
> ---
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index e035002d3fb6..08a4af411e25 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -345,10 +345,9 @@ struct huge_bootmem_page {
> struct page *alloc_huge_page(struct vm_area_struct *vma,
> unsigned long addr, int avoid_reserve);
> struct page *alloc_huge_page_node(struct hstate *h, int nid);
> -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
> - unsigned long addr, int avoid_reserve);
> struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> nodemask_t *nmask);
> +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address);
> int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
> pgoff_t idx);
>
> @@ -526,7 +525,7 @@ struct hstate {};
> #define alloc_huge_page(v, a, r) NULL
> #define alloc_huge_page_node(h, nid) NULL
> #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
> -#define alloc_huge_page_noerr(v, a, r) NULL
> +#define alloc_huge_page_vma(vma, address) NULL
> #define alloc_bootmem_huge_page(h) NULL
> #define hstate_file(f) NULL
> #define hstate_sizelog(s) NULL
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4426c5b23a20..e00deabe6d17 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1672,6 +1672,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
> return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
> }
>
> +/* mempolicy aware migration callback */
> +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address)
> +{
> + struct mempolicy *mpol;
> + nodemask_t *nodemask;
> + struct page *page;
> + struct hstate *h;
> + gfp_t gfp_mask;
> + int node;
> +
> + h = hstate_vma(vma);
> + gfp_mask = htlb_alloc_mask(h);
> + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
> + page = alloc_huge_page_nodemask(h, node, nodemask);
> + mpol_cond_put(mpol);
> +
> + return page;
> +}
> +
> /*
> * Increase the hugetlb pool such that it can accommodate a reservation
> * of size 'delta'.
> @@ -2077,20 +2096,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
> return ERR_PTR(-ENOSPC);
> }
>
> -/*
> - * alloc_huge_page()'s wrapper which simply returns the page if allocation
> - * succeeds, otherwise NULL. This function is called from new_vma_page(),
> - * where no ERR_VALUE is expected to be returned.
> - */
> -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
> - unsigned long addr, int avoid_reserve)
> -{
> - struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
> - if (IS_ERR(page))
> - page = NULL;
> - return page;
> -}
> -
> int alloc_bootmem_huge_page(struct hstate *h)
> __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> int __alloc_bootmem_huge_page(struct hstate *h)
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index f604b22ebb65..96823fa07f38 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
> }
>
> if (PageHuge(page)) {
> - BUG_ON(!vma);
> - return alloc_huge_page_noerr(vma, address, 1);
> + return alloc_huge_page_vma(vma, address);
> } else if (thp_migration_supported() && PageTransHuge(page)) {
> struct page *thp;
>
>
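
For context on the SetPageHugeTemporary() remark above: earlier in this
series, migration allocations are switched to fresh, temporary pages that
sit outside the surplus/overcommit accounting, which is exactly what
alloc_huge_page_noerr() does not do today. A rough sketch of that helper
follows; it is a paraphrase under the assumption that the fresh-page
allocator takes (hstate, gfp_mask, nid, nodemask), not the exact hunk from
the series.

/*
 * Rough sketch of alloc_migrate_huge_page() as introduced earlier in the
 * series: the fresh page is flagged as temporary so it does not count
 * against the surplus/overcommit limits, which is why the nodemask based
 * migration paths keep working when surplus_huge_pages >=
 * nr_overcommit_huge_pages.
 */
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nmask)
{
	struct page *page;

	if (hstate_is_gigantic(h))
		return NULL;

	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
	if (!page)
		return NULL;

	/*
	 * Do not account the page as surplus: it only exists to replace
	 * the migrated source page and is freed on the last reference.
	 */
	SetPageHugeTemporary(page);

	return page;
}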

2017-12-22 09:48:58

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH 0/5] mm, hugetlb: allocation API and migration improvements

On Thu 21-12-17 15:35:28, Mike Kravetz wrote:
[...]
> You can add,
>
> Reviewed-by: Mike Kravetz <[email protected]>
>
> I had some concerns, not specific to this patch, about transferring huge
> page state during migration, so I did a bunch of testing.

On Fri 22-12-17 08:58:48, Naoya Horiguchi wrote:
[...]
> Yes, I tested again with the additional changes below, and hugetlb
> migration from mbind(2) works fine. Thank you very much for your work.
>
> Reviewed-by: Naoya Horiguchi <[email protected]>
>
> for the series.

Thanks a lot to both of you! I have added the changelog to the last
patch. I am currently busy as hell, so I am unlikely to send the whole
thing before the new year, but please double check the changelog if you
find some more time.
---
From 65e7dda5dfe90fa656285729a270855e93637caa Mon Sep 17 00:00:00 2001
From: Michal Hocko <[email protected]>
Date: Fri, 22 Dec 2017 10:31:18 +0100
Subject: [PATCH] hugetlb, mempolicy: fix the mbind hugetlb migration

The do_mbind migration code relies on alloc_huge_page_noerr for hugetlb
pages. alloc_huge_page_noerr uses alloc_huge_page, which is a high-level
allocation function that has to take care of reserves, overcommit and
hugetlb cgroup accounting. None of that is required for page migration,
because the new page is only temporary and will either replace the
original page or be dropped. This matches the other hugetlb migration
call paths, so there is no reason to handle mbind specially.

The current implementation is also suboptimal because the migration
might fail merely because the hugetlb cgroup limit has been reached or
the overcommit limit is saturated.

Fix this by making mbind behave like the other hugetlb migration paths.
Add a new migration helper, alloc_huge_page_vma, as a wrapper around
alloc_huge_page_nodemask with additional mempolicy handling.

alloc_huge_page_noerr has no remaining users, so it can go.

Reviewed-by: Mike Kravetz <[email protected]>
Reviewed-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Michal Hocko <[email protected]>
---
include/linux/hugetlb.h | 5 ++---
mm/hugetlb.c | 33 +++++++++++++++++++--------------
mm/mempolicy.c | 3 +--
3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e035002d3fb6..08a4af411e25 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -345,10 +345,9 @@ struct huge_bootmem_page {
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask);
+struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);

@@ -526,7 +525,7 @@ struct hstate {};
#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
-#define alloc_huge_page_noerr(v, a, r) NULL
+#define alloc_huge_page_vma(vma, address) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_sizelog(s) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4426c5b23a20..e00deabe6d17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1672,6 +1672,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}

+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct page *page;
+ struct hstate *h;
+ gfp_t gfp_mask;
+ int node;
+
+ h = hstate_vma(vma);
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
+}
+
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -2077,20 +2096,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
return ERR_PTR(-ENOSPC);
}

-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
-{
- struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
- if (IS_ERR(page))
- page = NULL;
- return page;
-}
-
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f604b22ebb65..96823fa07f38 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
}

if (PageHuge(page)) {
- BUG_ON(!vma);
- return alloc_huge_page_noerr(vma, address, 1);
+ return alloc_huge_page_vma(vma, address);
} else if (thp_migration_supported() && PageTransHuge(page)) {
struct page *thp;

--
2.15.1

--
Michal Hocko
SUSE Labs