Hi,
As discussed in the previous thread [1], there is an inconsistency when handling
hugetlb migration. When migrating a free hugetlb page, fallback to other NUMA
nodes is prevented in alloc_and_dissolve_hugetlb_folio(). However, when migrating
an in-use hugetlb page, fallback to other NUMA nodes is allowed in
alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and
might result in unexpected failures when node-bound workloads don't get what
they assume is available.
This patch set tries to make the hugetlb migration strategy clearer and more
consistent. Please find details in each patch.
[1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/
Baolin Wang (3):
mm: record the migration reason for struct migration_target_control
mm: hugetlb: make the hugetlb migration strategy consistent
docs: hugetlbpage.rst: add hugetlb migration description
Documentation/admin-guide/mm/hugetlbpage.rst | 7 +++++
include/linux/hugetlb.h | 4 +--
mm/gup.c | 1 +
mm/hugetlb.c | 28 ++++++++++++++++++--
mm/internal.h | 1 +
mm/memory-failure.c | 1 +
mm/memory_hotplug.c | 1 +
mm/mempolicy.c | 3 ++-
mm/migrate.c | 3 ++-
mm/page_alloc.c | 1 +
mm/vmscan.c | 3 ++-
11 files changed, 46 insertions(+), 7 deletions(-)
--
2.39.3
As discussed in the previous thread [1], there is an inconsistency when handling
hugetlb migration. When migrating a free hugetlb page, fallback to other NUMA
nodes is prevented in alloc_and_dissolve_hugetlb_folio(). However, when migrating
an in-use hugetlb page, fallback to other NUMA nodes is allowed in
alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool and
might result in unexpected failures when node-bound workloads don't get what
they assume is available.
To make the hugetlb migration strategy clearer, let's list all the hugetlb
migration scenarios and analyze whether allocation fallback is permitted:
1) Memory offline: will call dissolve_free_huge_pages() to free the free hugetlb
pages, and call do_migrate_range() to migrate the in-use hugetlb pages. Both can
break the per-node hugetlb pool, but as this is an explicit offlining operation,
there is no better choice. So hugetlb allocation fallback should be allowed.
2) Memory failure: similar to memory offline. Falling back to a different node
might be the only option to handle it, otherwise the impact of poisoned memory
can be amplified. So fallback should be allowed.
3) Longterm pinning: will call migrate_longterm_unpinnable_pages() to migrate in-use
and not-longterm-pinnable hugetlb pages, which can break the per-node pool. But the
longterm pinning should fail if a new hugetlb page cannot be allocated on the current
node, in order to avoid breaking the per-node pool.
4) Syscalls (mbind, migrate_pages, move_pages): these are explicit user operations
to move pages to other nodes, so fallback to other nodes should not be prohibited.
5) alloc_contig_range: used by CMA allocation and virtio-mem fake-offline to allocate
a given range of pages. Since migration of a free hugetlb page is currently not allowed
to fall back, migration of an in-use hugetlb page should not be allowed to fall back
either, to keep consistency.
6) alloc_contig_pages: used by kfence, pgtable_debug etc. The strategy should be
consistent with that of alloc_contig_range().
Based on the analysis of the various scenarios above, determine whether fallback is
permitted according to the migration reason in alloc_hugetlb_folio_nodemask().
[1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/
Signed-off-by: Baolin Wang <[email protected]>
---
include/linux/hugetlb.h | 4 ++--
mm/hugetlb.c | 28 ++++++++++++++++++++++++++--
mm/mempolicy.c | 2 +-
mm/migrate.c | 2 +-
4 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..fa122dc509cf 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -747,7 +747,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask);
+ nodemask_t *nmask, gfp_t gfp_mask, int reason);
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1065,7 +1065,7 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
static inline struct folio *
alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask, int reason)
{
return NULL;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 68283e54c899..a55cfc7844bc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2621,8 +2621,10 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
/* folio migration callback function */
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask, int reason)
{
+ bool allowed_fallback = false;
+
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
struct folio *folio;
@@ -2636,6 +2638,28 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock_irq(&hugetlb_lock);
+ if (gfp_mask & __GFP_THISNODE)
+ goto alloc_new;
+
+ /*
+ * Note: the memory offline, memory failure and migration syscalls can break
+ * the per-node hugetlb pool. Other cases can not allocate new hugetlb on
+ * other nodes.
+ */
+ switch (reason) {
+ case MR_MEMORY_HOTPLUG:
+ case MR_MEMORY_FAILURE:
+ case MR_SYSCALL:
+ case MR_MEMPOLICY_MBIND:
+ allowed_fallback = true;
+ break;
+ default:
+ break;
+ }
+
+ if (!allowed_fallback)
+ gfp_mask |= __GFP_THISNODE;
+alloc_new:
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
@@ -6666,7 +6690,7 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, -1);
mpol_cond_put(mpol);
return folio;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 98ceb12e0e17..436e817eeaeb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1228,7 +1228,7 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
h = folio_hstate(src);
gfp = htlb_alloc_mask(h);
nodemask = policy_nodemask(gfp, pol, ilx, &nid);
- return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
+ return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, MR_MEMPOLICY_MBIND);
}
if (folio_test_large(src))
diff --git a/mm/migrate.c b/mm/migrate.c
index bde63010a3cf..0c2b70800da3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2022,7 +2022,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_hugetlb_folio_nodemask(h, nid,
- mtc->nmask, gfp_mask);
+ mtc->nmask, gfp_mask, mtc->reason);
}
if (folio_test_large(src)) {
--
2.39.3
Add some description of the hugetlb migration strategy.
Signed-off-by: Baolin Wang <[email protected]>
---
Documentation/admin-guide/mm/hugetlbpage.rst | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index e4d4b4a8dc97..68d7bc2165c9 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -376,6 +376,13 @@ Note that the number of overcommit and reserve pages remain global quantities,
as we don't know until fault time, when the faulting task's mempolicy is
applied, from which node the huge page allocation will be attempted.
+Hugetlb pages may be migrated between the per-node hugepages pools in the following
+scenarios: memory offline, memory failure, longterm pinning, syscalls (mbind,
+migrate_pages, move_pages), alloc_contig_range() and alloc_contig_pages(). Currently
+only memory offline, memory failure and the syscalls allow fallback to allocate a new
+hugetlb page on a different node if the current node is unable to allocate during
+hugetlb migration, which means these 3 cases can break the per-node hugepages pool.
+
.. _using_huge_pages:
Using Huge Pages
--
2.39.3
To support different hugetlb allocation strategies during hugetlb migration
based on various migration reasons, record the migration reason in the
migration_target_control structure as a preparation.
Signed-off-by: Baolin Wang <[email protected]>
---
mm/gup.c | 1 +
mm/internal.h | 1 +
mm/memory-failure.c | 1 +
mm/memory_hotplug.c | 1 +
mm/mempolicy.c | 1 +
mm/migrate.c | 1 +
mm/page_alloc.c | 1 +
mm/vmscan.c | 3 ++-
8 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/mm/gup.c b/mm/gup.c
index df83182ec72d..959a1a05b059 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2132,6 +2132,7 @@ static int migrate_longterm_unpinnable_pages(
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
+ .reason = MR_LONGTERM_PIN,
};
if (migrate_pages(movable_page_list, alloc_migration_target,
diff --git a/mm/internal.h b/mm/internal.h
index 93e229112045..7677ee4d8e12 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -958,6 +958,7 @@ struct migration_target_control {
int nid; /* preferred node id */
nodemask_t *nmask;
gfp_t gfp_mask;
+ enum migrate_reason reason;
};
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9349948f1abf..780bb2aee0af 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2666,6 +2666,7 @@ static int soft_offline_in_use_page(struct page *page)
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_MEMORY_FAILURE,
};
if (!huge && folio_test_large(folio)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a444e2d7dd2b..b79ba36e09e0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1841,6 +1841,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
struct migration_target_control mtc = {
.nmask = &nmask,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_MEMORY_HOTPLUG,
};
int ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f60b4c99f130..98ceb12e0e17 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1070,6 +1070,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ .reason = MR_SYSCALL,
};
nodes_clear(nmask);
diff --git a/mm/migrate.c b/mm/migrate.c
index 73a052a382f1..bde63010a3cf 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2060,6 +2060,7 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node)
struct migration_target_control mtc = {
.nid = node,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ .reason = MR_SYSCALL,
};
err = migrate_pages(pagelist, alloc_migration_target, NULL,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b0b92ce997dc..81ba73d77921 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6264,6 +6264,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_CONTIG_RANGE,
};
lru_cache_disable();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87df3a48bdd7..d111c8e3b40e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -978,7 +978,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
- .nmask = &allowed_mask
+ .nmask = &allowed_mask,
+ .reason = MR_DEMOTION,
};
if (list_empty(demote_folios))
--
2.39.3
On Wed, Feb 21, 2024 at 05:27:54PM +0800, Baolin Wang wrote:
> Based on the analysis of the various scenarios above, determine whether fallback is
> permitted according to the migration reason in alloc_hugetlb_folio_nodemask().
Hi Baolin,
The high level reasoning makes sense to me, taking a step back and
thinking about all cases and possible outcomes makes sense to me.
I plan to look closer, but something caught my eye:
> }
> spin_unlock_irq(&hugetlb_lock);
>
> + if (gfp_mask & __GFP_THISNODE)
> + goto alloc_new;
> +
> + /*
> + * Note: the memory offline, memory failure and migration syscalls can break
> + * the per-node hugetlb pool. Other cases can not allocate new hugetlb on
> + * other nodes.
> + */
> + switch (reason) {
> + case MR_MEMORY_HOTPLUG:
> + case MR_MEMORY_FAILURE:
> + case MR_SYSCALL:
> + case MR_MEMPOLICY_MBIND:
> + allowed_fallback = true;
> + break;
> + default:
> + break;
> + }
> +
> + if (!allowed_fallback)
> + gfp_mask |= __GFP_THISNODE;
I think it would be better if instead of fiddling with gfp here,
have htlb_alloc_mask() have a second argument with the MR_reason,
do the switch there and enable GFP_THISNODE.
Then alloc_hugetlb_folio_nodemask() would already get the right mask.
I think that that might be more clear as it gets encapsulated in the
function that directly gives us the gfp.
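Something along these lines, completely untested and just to sketch the idea
(this keeps the current movable check and only adds the switch; which MR_
reasons end up in there is of course up for discussion):

static inline gfp_t htlb_alloc_mask(struct hstate *h, int reason)
{
	gfp_t gfp_mask = hugepage_movable_supported(h) ?
			 GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER;

	switch (reason) {
	case MR_MEMORY_HOTPLUG:
	case MR_MEMORY_FAILURE:
	case MR_SYSCALL:
	case MR_MEMPOLICY_MBIND:
		/* These callers may fall back and break the per-node pool. */
		break;
	default:
		gfp_mask |= __GFP_THISNODE;
		break;
	}

	return gfp_mask;
}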
Does that make sense?
--
Oscar Salvador
SUSE Labs
On 2024/2/23 06:15, Oscar Salvador wrote:
> On Wed, Feb 21, 2024 at 05:27:54PM +0800, Baolin Wang wrote:
>> Based on the analysis of the various scenarios above, determine whether fallback is
>> permitted according to the migration reason in alloc_hugetlb_folio_nodemask().
>
> Hi Baolin,
>
> The high level reasoning makes sense to me, taking a step back and
> thinking about all cases and possible outcomes makes sense to me.
>
> I plan to look closer, but something caught my eye:
Thanks for reviewing.
>> }
>> spin_unlock_irq(&hugetlb_lock);
>>
>> + if (gfp_mask & __GFP_THISNODE)
>> + goto alloc_new;
>> +
>> + /*
>> + * Note: the memory offline, memory failure and migration syscalls can break
>> + * the per-node hugetlb pool. Other cases can not allocate new hugetlb on
>> + * other nodes.
>> + */
>> + switch (reason) {
>> + case MR_MEMORY_HOTPLUG:
>> + case MR_MEMORY_FAILURE:
>> + case MR_SYSCALL:
>> + case MR_MEMPOLICY_MBIND:
>> + allowed_fallback = true;
>> + break;
>> + default:
>> + break;
>> + }
>> +
>> + if (!allowed_fallback)
>> + gfp_mask |= __GFP_THISNODE;
>
> I think it would be better if instead of fiddling with gfp here,
> have htlb_alloc_mask() have a second argument with the MR_reason,
> do the switch there and enable GFP_THISNODE.
> Then alloc_hugetlb_folio_nodemask() would already get the right mask.
>
> I think that that might be more clear as it gets encapsulated in the
> function that directly gives us the gfp.
>
> Does that make sense?
I previously considered passing the MR_reason argument to
htlb_modify_alloc_mask(), which is only used by hugetlb migration.
But in alloc_hugetlb_folio_nodemask(), if there are available hugetlb pages
on other nodes, we should allow migrating to them, as that will not break
the per-node hugetlb pool.
That's why I only change the gfp_mask when allocating a new hugetlb page
during migration, which is the case that can break the pool.
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int
preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
struct folio *folio;
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
preferred_nid, nmask);
if (folio) {
spin_unlock_irq(&hugetlb_lock);
return folio;
}
}
.....
On Fri, Feb 23, 2024 at 10:56:48AM +0800, Baolin Wang wrote:
> I previously considered passing the MR_reason argument to
> htlb_modify_alloc_mask(), which is only used by hugetlb migration.
> But in alloc_hugetlb_folio_nodemask(), if there are available hugetlb pages
> on other nodes, we should allow migrating to them, as that will not break
> the per-node hugetlb pool.
>
> That's why I only change the gfp_mask when allocating a new hugetlb page
> during migration, which is the case that can break the pool.
Code-wise I think this is good, but I have some reservations
about where we filter out the mask.
Ok, I'm trying to get my head around this.
It's been a while since I looked into hugetlb code, so here we go.
You mentioned that the only reason not to fiddle with gfp_mask before calling
alloc_hugetlb_folio_nodemask() was that we might be able to find a hugetlb
page on another node, and that that's ok because, since all nodes remain with
the same number of hugetlb pages, the per-node pool doesn't get broken.
Now, I see that dequeue_hugetlb_folio_nodemask() first tries to get the zonelist
of the preferred node, and AFAICS, if it has !GFP_THISNODE, it should also
get the zonelists of all other nodes, so we might fall back.
In the hope of finding a way to do the filtering in htlb_modify_alloc_mask(),
I was trying to see whether we could skip GFP_THISNODE in
dequeue_hugetlb_folio_nodemask(), but no, because we might end up dequeueing
a hugetlb page which sits on another node while we really specified __GFP_THISNODE.
The only way might be to somehow decouple dequeue_hugetlb_folio_nodemask()
from alloc_hugetlb_folio_nodemask() and do some kind of gfp modification
between the two calls.
Another thing I dislike is the "-1" in alloc_hugetlb_folio_vma().
I think at least it deserves a comment like "Passing -1 will make us stick
to GFP_THISNODE".
That is another thing though: we will pass "-1", which forces GFP_THISNODE
when allocating a fresh hugetlb page, but in dequeue_hugetlb_folio_nodemask()
we might get a page from a different node.
That doesn't break the per-node pool, but it is somewhat odd?
--
Oscar Salvador
SUSE Labs
On 2024/2/23 22:19, Oscar Salvador wrote:
> On Fri, Feb 23, 2024 at 10:56:48AM +0800, Baolin Wang wrote:
>
>> I previously considered passing the MR_reason argument to
>> htlb_modify_alloc_mask(), which is only used by hugetlb migration.
>> But in alloc_hugetlb_folio_nodemask(), if there are available hugetlb pages
>> on other nodes, we should allow migrating to them, as that will not break
>> the per-node hugetlb pool.
>>
>> That's why I only change the gfp_mask when allocating a new hugetlb page
>> during migration, which is the case that can break the pool.
>
> Code-wise I think this is good, but I have some reservations
> about where we filter out the mask.
> Ok, I'm trying to get my head around this.
> It's been a while since I looked into hugetlb code, so here we go.
>
> You mentioned that the only reason not to fiddle with gfp_mask before calling
> alloc_hugetlb_folio_nodemask() was that we might be able to find a hugetlb
> page on another node, and that that's ok because, since all nodes remain with
> the same number of hugetlb pages, the per-node pool doesn't get broken.
>
> Now, I see that dequeue_hugetlb_folio_nodemask() first tries to get the zonelist
> of the preferred node, and AFAICS, if it has !GFP_THISNODE, it should also
> get the zonelists of all other nodes, so we might fall back.
Right.
> In the hope of finding a way to do the filtering in htlb_modify_alloc_mask(),
> I was trying to see whether we could skip GFP_THISNODE in
> dequeue_hugetlb_folio_nodemask(), but no, because we might end up dequeueing
> a hugetlb page which sits on another node while we really specified __GFP_THISNODE.
>
> The only way might be to somehow decouple dequeue_hugetlb_folio_nodemask()
> from alloc_hugetlb_folio_nodemask() and do some kind of gfp modification
> between the two calls.
IMO, I'm not sure whether it's appropriate to decouple
dequeue_hugetlb_folio_nodemask() from alloc_hugetlb_folio_nodemask()
into two separate functions for the users to call, because these details
should be hidden within the hugetlb core implementation.
Instead, I can move the gfp_mask fiddling into a new helper, and move
the helper into alloc_migrate_hugetlb_folio(). Having the temporary
hugetlb allocation use its own gfp strategy seems reasonable to me.
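Roughly like the following (untested, just to show the shape of the idea;
the helper name is only a placeholder):

static bool hugetlb_migration_allows_fallback(int reason)
{
	/*
	 * Only the memory offline, memory failure and migration syscalls
	 * may break the per-node hugetlb pool.
	 */
	switch (reason) {
	case MR_MEMORY_HOTPLUG:
	case MR_MEMORY_FAILURE:
	case MR_SYSCALL:
	case MR_MEMPOLICY_MBIND:
		return true;
	default:
		return false;
	}
}

and then alloc_migrate_hugetlb_folio() would do:

	if (!(gfp_mask & __GFP_THISNODE) &&
	    !hugetlb_migration_allows_fallback(reason))
		gfp_mask |= __GFP_THISNODE;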
> Another thing I dislike is the "-1" in alloc_hugetlb_folio_vma().
> I think at least it deserves a comment like "Passing -1 will make us stick
> to GFP_THISNODE".
Sure, will add some comments.
> That is another thing though: we will pass "-1", which forces GFP_THISNODE
> when allocating a fresh hugetlb page, but in dequeue_hugetlb_folio_nodemask()
> we might get a page from a different node.
> That doesn't break the per-node pool, but it is somewhat odd?
Only hugetlb_mfill_atomic_pte() will use -1, which is used to allocate a
temporary hugetlb page to hold the copied content that will be immediately
released once the uffd copy completes (see commit 8cc5fcbb5be8). Therefore,
it is allowed to fall back to an available hugetlb page on other nodes here,
but it is not allowed to break the per-node pool.
On 2024/2/26 17:17, Oscar Salvador wrote:
> On Mon, Feb 26, 2024 at 11:34:51AM +0800, Baolin Wang wrote:
>> IMO, I'm not sure whether it's appropriate to decouple
>> dequeue_hugetlb_folio_nodemask() from alloc_hugetlb_folio_nodemask() into
>> two separate functions for the users to call, because these details should
>> be hidden within the hugetlb core implementation.
>>
>> Instead, I can move the gfp_mask fiddling into a new helper, and move the
>> helper into alloc_migrate_hugetlb_folio(). Having the temporary hugetlb
>> allocation use its own gfp strategy seems reasonable to me.
>
> An alternative would be to do the following, which does not further carry
> the "reason" argument into hugetlb code.
> (Not even compile tested, just a PoC)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index c1ee640d87b1..8a89a1007dcb 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -970,6 +970,24 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> return modified_mask;
> }
>
> +static inline bool htlb_allow_fallback(int reason)
> +{
> + bool allowed_fallback = false;
> +
> + switch (reason) {
> + case MR_MEMORY_HOTPLUG:
> + case MR_MEMORY_FAILURE:
> + case MR_SYSCALL:
> + case MR_MEMPOLICY_MBIND:
> + allowed_fallback = true;
> + break;
> + default:
> + break;
> + }
> +
> + return allowed_fallback;
> +}
> +
Thanks for providing an alternative implementation. However, I still
prefer to hide these details in the hugetlb core, since callers should not
need to pay attention to these hugetlb internals. So something like:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418d66953224..e8eb08bbc688 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2567,13 +2567,38 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
}
static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+ int nid, nodemask_t *nmask, int reason)
{
struct folio *folio;
+ bool allowed_fallback = false;
if (hstate_is_gigantic(h))
return NULL;
+ if (gfp_mask & __GFP_THISNODE)
+ goto alloc_new;
+
+ /*
+ * Note: the memory offline, memory failure and migration syscalls will
+ * be allowed to fall back to other nodes due to lack of a better choice,
+ * which might break the per-node hugetlb pool. Other cases will set
+ * __GFP_THISNODE to avoid breaking the per-node hugetlb pool.
+ */
+ switch (reason) {
+ case MR_MEMORY_HOTPLUG:
+ case MR_MEMORY_FAILURE:
+ case MR_SYSCALL:
+ case MR_MEMPOLICY_MBIND:
+ allowed_fallback = true;
+ break;
+ default:
+ break;
+ }
+
+ if (!allowed_fallback)
+ gfp_mask |= __GFP_THISNODE;
+
+alloc_new:
folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
@@ -2621,7 +2646,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
/* folio migration callback function */
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask, int reason)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
@@ -2636,7 +2661,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock_irq(&hugetlb_lock);
- return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
+ return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask, reason);
}
/*
@@ -6653,7 +6678,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+ /*
+ * This is used to allocate a temporary hugetlb to hold the copied
+ * content, which will then be copied again to the final hugetlb
+ * consuming a reservation. Set the migrate reason to -1 to indicate
+ * that breaking the per-node hugetlb pool is not allowed in this case.
+ */
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, -1);
mpol_cond_put(mpol);
return folio;
What do you think? Thanks.
On Mon, Feb 26, 2024 at 11:34:51AM +0800, Baolin Wang wrote:
> IMO, I'm not sure whether it's appropriate to decouple
> dequeue_hugetlb_folio_nodemask() from alloc_hugetlb_folio_nodemask() into
> two separate functions for the users to call, because these details should
> be hidden within the hugetlb core implementation.
>
> Instead, I can move the gfp_mask fiddling into a new helper, and move the
> helper into alloc_migrate_hugetlb_folio(). Having the temporary hugetlb
> allocation use its own gfp strategy seems reasonable to me.
An alternative would be to do the following, which does not further carry
the "reason" argument into hugetlb code.
(Not even compile tested, just a PoC)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c1ee640d87b1..8a89a1007dcb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -970,6 +970,24 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
return modified_mask;
}
+static inline bool htlb_allow_fallback(int reason)
+{
+ bool allowed_fallback = false;
+
+ switch (reason) {
+ case MR_MEMORY_HOTPLUG:
+ case MR_MEMORY_FAILURE:
+ case MR_SYSCALL:
+ case MR_MEMPOLICY_MBIND:
+ allowed_fallback = true;
+ break;
+ default:
+ break;
+ }
+
+ return allowed_fallback;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ed1581b670d4..7e8d6b5885d6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2619,7 +2619,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
/* folio migration callback function */
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask, bool allow_fallback)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
@@ -2634,6 +2634,12 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock_irq(&hugetlb_lock);
+ /*
+ * We cannot fallback to other nodes, as we could break the per-node pool
+ */
+ if (!allow_fallback)
+ gfp_mask |= __GFP_THISNODE;
+
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index cc9f2bcd73b4..8820009acadd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2016,10 +2016,13 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
if (folio_test_hugetlb(src)) {
struct hstate *h = folio_hstate(src);
+ bool allow_fallback;
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ allow_fallback = htlb_allow_fallback(mtc->reason);
return alloc_hugetlb_folio_nodemask(h, nid,
- mtc->nmask, gfp_mask);
+ mtc->nmask, gfp_mask,
+ allow_fallback);
}
if (folio_test_large(src)) {
--
Oscar Salvador
SUSE Labs