Avoid doing a per cpu read and use the local variable thisnid. IMHO
this also makes the code more readable.
Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
Signed-off-by: Donet Tom <[email protected]>
---
mm/mempolicy.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..8478574c000c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
- node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
--
2.39.3
commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
nodes") added support for migrate on protnone reference with MPOL_BIND
memory policy. This allowed numa fault migration when the executing node
is part of the policy mask for MPOL_BIND. This patch extends migration
support to MPOL_PREFERRED_MANY policy.
Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
MPOL_F_NUMA_BALANCING. This causes issues when we want to use
NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
the kernel should not allocate pages from the slower memory tier via
allocation control zonelist fallback. Instead, we should move cold pages
from the faster memory node via memory demotion. For a page allocation,
kswapd is only woken up after we try to allocate pages from all nodes in
the allocation zone list. This implies that, without using memory
policies, we will end up allocating hot pages in the slower memory tier.
MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
allocation control when we have memory tiers in the system. With
MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
of faster memory nodes. When we fail to allocate pages from the faster
memory node, kswapd would be woken up, allowing demotion of cold pages
to slower memory nodes.
With the current kernel, such usage of memory policies implies we can't
do page promotion from a slower memory tier to a faster memory tier
using numa fault. This patch fixes this issue.
For MPOL_PREFERRED_MANY, if the executing node is in the policy node
mask, we allow numa migration to the executing node. If the executing
node is not in the policy node mask but the folio is already allocated
based on policy preference (the folio node is in the policy node mask),
we don't allow numa migration. If both the executing node and the folio
node are outside the policy node mask, we allow numa migration to the
executing node.
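For example, with this change a tiering-aware application could request
this behaviour with something like the following (illustrative only; the
node mask used here is just an example):

	unsigned long fast_nodes = (1UL << 0) | (1UL << 1);	/* e.g. the DRAM nodes */

	/* Rejected today for MPOL_PREFERRED_MANY; accepted with this patch. */
	set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
		      &fast_nodes, sizeof(fast_nodes) * 8);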
Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
Signed-off-by: Donet Tom <[email protected]>
---
mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73d698e21dae..8c4c92b10371 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
if (*flags & MPOL_F_NUMA_BALANCING) {
- if (*mode != MPOL_BIND)
+ if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
+ *flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ else
return -EINVAL;
- *flags |= (MPOL_F_MOF | MPOL_F_MORON);
}
return 0;
}
@@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
+ struct mempolicy *pol)
+{
+ /* if the executing node is in the policy node mask, migrate */
+ if (node_isset(exec_node, pol->nodes))
+ return true;
+
+ /* If the folio node is in policy node mask, don't migrate */
+ if (node_isset(folio_node, pol->nodes))
+ return false;
+ /*
+ * both the folio node and executing node are outside the policy nodemask,
+ * migrate as normal numa fault migration.
+ */
+ return true;
+}
+
/**
* mpol_misplaced - check whether current folio node is valid in policy
*
@@ -2526,6 +2544,12 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
break;
case MPOL_PREFERRED_MANY:
+ if (pol->flags & MPOL_F_MORON) {
+ if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
+ goto out;
+ break;
+ }
+
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
--
2.39.3
On Sat, 17 Feb 2024 01:31:33 -0600 Donet Tom <[email protected]> wrote:
> Avoid doing a per cpu read and use the local variable thisnid. IMHO
> this also makes the code more readable.
>
> ...
>
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> if (node_isset(curnid, pol->nodes))
> goto out;
> z = first_zones_zonelist(
> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
> + node_zonelist(thisnid, GFP_HIGHUSER),
> gfp_zone(GFP_HIGHUSER),
> &pol->nodes);
> polnid = zone_to_nid(z->zone);
int thisnid = cpu_to_node(thiscpu);
Is there any difference between numa_node_id() and
cpu_to_node(raw_smp_processor_id())? And is it explicable that we're
using one here and not the other?
On 2/19/24 03:08, Andrew Morton wrote:
> On Sat, 17 Feb 2024 01:31:33 -0600 Donet Tom <[email protected]> wrote:
>
>> Avoid doing a per cpu read and use the local variable thisnid. IMHO
>> this also makes the code more readable.
>>
>> ...
>>
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>> if (node_isset(curnid, pol->nodes))
>> goto out;
>> z = first_zones_zonelist(
>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>> + node_zonelist(thisnid, GFP_HIGHUSER),
>> gfp_zone(GFP_HIGHUSER),
>> &pol->nodes);
>> polnid = zone_to_nid(z->zone);
> int thisnid = cpu_to_node(thiscpu);
>
> Is there any dofference between numa_node_id() and
> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
> using one here and not the other?
Hi Andrew
Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id.
Since the current execution node is already fetched at the beginning (thisnid), we can reuse it instead of fetching it again.
Thanks
Donet Tom
>
On Sat 17-02-24 01:31:35, Donet Tom wrote:
> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
> nodes") added support for migrate on protnone reference with MPOL_BIND
> memory policy. This allowed numa fault migration when the executing node
> is part of the policy mask for MPOL_BIND. This patch extends migration
> support to MPOL_PREFERRED_MANY policy.
>
> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
> the kernel should not allocate pages from the slower memory tier via
> allocation control zonelist fallback. Instead, we should move cold pages
> from the faster memory node via memory demotion. For a page allocation,
> kswapd is only woken up after we try to allocate pages from all nodes in
> the allocation zone list. This implies that, without using memory
> policies, we will end up allocating hot pages in the slower memory tier.
>
> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
> allocation control when we have memory tiers in the system. With
> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
> of faster memory nodes. When we fail to allocate pages from the faster
> memory node, kswapd would be woken up, allowing demotion of cold pages
> to slower memory nodes.
>
> With the current kernel, such usage of memory policies implies we can't
> do page promotion from a slower memory tier to a faster memory tier
> using numa fault. This patch fixes this issue.
>
> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
> mask, we allow numa migration to the executing nodes. If the executing
> node is not in the policy node mask but the folio is already allocated
> based on policy preference (the folio node is in the policy node mask),
> we don't allow numa migration. If both the executing node and folio node
> are outside the policy node mask, we allow numa migration to the
> executing nodes.
The feature makes sense to me. How has this been tested? Do you have any
numbers to present?
> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
> Signed-off-by: Donet Tom <[email protected]>
> ---
> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
> 1 file changed, 26 insertions(+), 2 deletions(-)
I haven't spotted anything obviously wrong in the patch itself but I
admit this is not an area I am actively familiar with so I might be
missing something.
--
Michal Hocko
SUSE Labs
On 2/19/24 17:37, Michal Hocko wrote:
> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>> nodes") added support for migrate on protnone reference with MPOL_BIND
>> memory policy. This allowed numa fault migration when the executing node
>> is part of the policy mask for MPOL_BIND. This patch extends migration
>> support to MPOL_PREFERRED_MANY policy.
>>
>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>> the kernel should not allocate pages from the slower memory tier via
>> allocation control zonelist fallback. Instead, we should move cold pages
>> from the faster memory node via memory demotion. For a page allocation,
>> kswapd is only woken up after we try to allocate pages from all nodes in
>> the allocation zone list. This implies that, without using memory
>> policies, we will end up allocating hot pages in the slower memory tier.
>>
>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>> allocation control when we have memory tiers in the system. With
>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>> of faster memory nodes. When we fail to allocate pages from the faster
>> memory node, kswapd would be woken up, allowing demotion of cold pages
>> to slower memory nodes.
>>
>> With the current kernel, such usage of memory policies implies we can't
>> do page promotion from a slower memory tier to a faster memory tier
>> using numa fault. This patch fixes this issue.
>>
>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>> mask, we allow numa migration to the executing nodes. If the executing
>> node is not in the policy node mask but the folio is already allocated
>> based on policy preference (the folio node is in the policy node mask),
>> we don't allow numa migration. If both the executing node and folio node
>> are outside the policy node mask, we allow numa migration to the
>> executing nodes.
> The feature makes sense to me. How has this been tested? Do you have any
> numbers to present?
Hi Michal
I have a test program which allocates memory on a specified node and
triggers promotion or migration by continuously accessing the pages
(a minimal sketch of the test is included after the results below).
Without this patch, if we set MPOL_PREFERRED_MANY, promotion or migration was not happening;
with this patch I could see the pages getting migrated or promoted.
My system has 2 CPU+DRAM nodes (Tier 1) and 1 PMEM node (Tier 2). Below
are my test results.
In the tables below, N0 and N1 are Tier 1 nodes and N6 is the Tier 2 node.
Exec_Node is the execution node, Policy is the set of nodes in the nodemask, and
"Curr Location Pages" is the node where the pages are present before migration
or promotion starts.
Test Results
------------
Scenario 1: the executing node is in the policy node mask
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N0 N1 N6      N1                     Pages Migrated from N1 to N0
N0           N0 N1 N6      N6                     Pages Promoted from N6 to N0
N0           N0 N1         N1                     Pages Migrated from N1 to N0
N0           N0 N1         N6                     Pages Promoted from N6 to N0

Scenario 2: the folio node is in the policy node mask and the exec node is not
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N1 N6         N1                     Pages are not migrating to N0
N0           N1 N6         N6                     Pages are not migrating to N0
N0           N1            N1                     Pages are not migrating to N0

Scenario 3: both the folio node and the executing node are outside the policy nodemask
================================================================================
Exec_Node    Policy        Curr Location Pages    Observations
================================================================================
N0           N1            N6                     Pages Promoted from N6 to N0
N0           N6            N1                     Pages Migrated from N1 to N0
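A minimal sketch of the kind of test program used is below (illustrative
only and untested here; the node numbers, the fallback macro values and
the NUMA-balancing sysctl setup are assumptions that should be checked
against the target system and uapi headers):

#define _GNU_SOURCE
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY	5		/* check uapi/linux/mempolicy.h */
#endif
#ifndef MPOL_F_NUMA_BALANCING
#define MPOL_F_NUMA_BALANCING	(1 << 13)	/* check uapi/linux/mempolicy.h */
#endif

#define FAST_NODE	0	/* tier-1 node in the policy mask (assumption) */
#define SLOW_NODE	6	/* tier-2 node initially holding the pages (assumption) */
#define NPAGES		256

int main(void)
{
	long page_sz = sysconf(_SC_PAGESIZE);
	unsigned long mask = 1UL << FAST_NODE;
	void *pages[NPAGES];
	int dest[NPAGES], status[NPAGES], i;
	char *buf;

	/* Prefer the fast node and opt in to NUMA-balancing migration. */
	if (set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
			  &mask, sizeof(mask) * 8) < 0) {
		perror("set_mempolicy");	/* fails on kernels without this patch */
		return 1;
	}

	buf = mmap(NULL, NPAGES * page_sz, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 1, NPAGES * page_sz);	/* fault the pages in */

	/* Move the pages to the slow node to get a known starting state. */
	for (i = 0; i < NPAGES; i++) {
		pages[i] = buf + i * page_sz;
		dest[i] = SLOW_NODE;
	}
	move_pages(0, NPAGES, pages, dest, status, MPOL_MF_MOVE);

	/* Keep accessing; NUMA hint faults should promote the hot pages. */
	for (i = 0; i < 100000; i++)
		memset(buf, i, NPAGES * page_sz);

	move_pages(0, NPAGES, pages, NULL, status, 0);	/* query current locations */
	printf("page 0 is now on node %d\n", status[0]);
	return 0;
}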
Thanks
Donet Tom
>
>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>> Signed-off-by: Donet Tom <[email protected]>
>> ---
>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>> 1 file changed, 26 insertions(+), 2 deletions(-)
> I haven't spotted anything obviously wrong in the patch itself but I
> admit this is not an area I am actively familiar with so I might be
> missing something.
On Sat 17-02-24 01:31:35, Donet Tom wrote:
[...]
> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
> + struct mempolicy *pol)
> +{
> + /* if the executing node is in the policy node mask, migrate */
> + if (node_isset(exec_node, pol->nodes))
> + return true;
> +
> + /* If the folio node is in policy node mask, don't migrate */
> + if (node_isset(folio_node, pol->nodes))
> + return false;
> + /*
> + * both the folio node and executing node are outside the policy nodemask,
> + * migrate as normal numa fault migration.
> + */
> + return true;
> +}
I have looked at this again and only now noticed that this doesn't
really work as one would expect.
case MPOL_PREFERRED_MANY:
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
break;
This will collapse the whole MPOL_PREFERRED_MANY nodemask into the first
node in that mask. Is that really what we want here? Shouldn't we use
the full nodemask as the migration target?
--
Michal Hocko
SUSE Labs
On 2/19/24 19:50, Michal Hocko wrote:
> On Sat 17-02-24 01:31:35, Donet Tom wrote:
> [...]
>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>> + struct mempolicy *pol)
>> +{
>> + /* if the executing node is in the policy node mask, migrate */
>> + if (node_isset(exec_node, pol->nodes))
>> + return true;
>> +
>> + /* If the folio node is in policy node mask, don't migrate */
>> + if (node_isset(folio_node, pol->nodes))
>> + return false;
>> + /*
>> + * both the folio node and executing node are outside the policy nodemask,
>> + * migrate as normal numa fault migration.
>> + */
>> + return true;
>> +}
> I have looked at this again and only now noticed that this doesn't
> really work as one would expected.
>
> case MPOL_PREFERRED_MANY:
> /*
> * use current page if in policy nodemask,
> * else select nearest allowed node, if any.
> * If no allowed nodes, use current [!misplaced].
> */
> if (node_isset(curnid, pol->nodes))
> goto out;
> z = first_zones_zonelist(
> node_zonelist(numa_node_id(), GFP_HIGHUSER),
> gfp_zone(GFP_HIGHUSER),
> &pol->nodes);
> polnid = zone_to_nid(z->zone);
> break;
>
> Will collapse the whole MPOL_PREFERRED_MANY nodemask into the first
> notde into that mask. Is that really what we want here? Shouldn't we use
> the full nodemask as the migration target?
With this patch it will take the full nodemask and find the correct migration target; it will not collapse into the first node.
For example, if we have 5 NUMA nodes in our system, N1 to N5, all five are in the nodemask and the execution node is N3.
With this fix, mpol_preferred_should_numa_migrate() will return true because the execution node is in the nodemask,
so mpol_misplaced() will select N3 as the migration target (since MPOL_F_MORON is set) and migrate the pages to N3:
/* Migrate the folio towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
So with this patch pages will get migrated to the correct migration target.
>
On Mon 19-02-24 20:37:17, Donet Tom wrote:
>
> On 2/19/24 19:50, Michal Hocko wrote:
> > On Sat 17-02-24 01:31:35, Donet Tom wrote:
> > [...]
> > > +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
> > > + struct mempolicy *pol)
> > > +{
> > > + /* if the executing node is in the policy node mask, migrate */
> > > + if (node_isset(exec_node, pol->nodes))
> > > + return true;
> > > +
> > > + /* If the folio node is in policy node mask, don't migrate */
> > > + if (node_isset(folio_node, pol->nodes))
> > > + return false;
> > > + /*
> > > + * both the folio node and executing node are outside the policy nodemask,
> > > + * migrate as normal numa fault migration.
> > > + */
> > > + return true;
> > > +}
> > I have looked at this again and only now noticed that this doesn't
> > really work as one would expected.
> >
> > case MPOL_PREFERRED_MANY:
> > /*
> > * use current page if in policy nodemask,
> > * else select nearest allowed node, if any.
> > * If no allowed nodes, use current [!misplaced].
> > */
> > if (node_isset(curnid, pol->nodes))
> > goto out;
> > z = first_zones_zonelist(
> > node_zonelist(numa_node_id(), GFP_HIGHUSER),
> > gfp_zone(GFP_HIGHUSER),
> > &pol->nodes);
> > polnid = zone_to_nid(z->zone);
> > break;
> >
> > Will collapse the whole MPOL_PREFERRED_MANY nodemask into the first
> > notde into that mask. Is that really what we want here? Shouldn't we use
> > the full nodemask as the migration target?
>
> With this patch it will take full nodemask and find out the correct migration target. It will not collapse into first node.
Correct me if I am wrong, but mpol_misplaced() will return the first node
of the preferred node mask and then migrate_misplaced_folio() would use
it as the target node for alloc_misplaced_dst_folio(), which performs a
__GFP_THISNODE allocation, so it won't fall back to a different node.
--
Michal Hocko
SUSE Labs
On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
> >> --- a/mm/mempolicy.c
> >> +++ b/mm/mempolicy.c
> >> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> >> if (node_isset(curnid, pol->nodes))
> >> goto out;
> >> z = first_zones_zonelist(
> >> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
> >> + node_zonelist(thisnid, GFP_HIGHUSER),
> >> gfp_zone(GFP_HIGHUSER),
> >> &pol->nodes);
> >> polnid = zone_to_nid(z->zone);
> > int thisnid = cpu_to_node(thiscpu);
> >
> > Is there any dofference between numa_node_id() and
> > cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
> > using one here and not the other?
>
> Hi Andrew
>
> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
Sure, but mine was a broader thought: why do we have both? Is one
preferable and if so why?
On 2/20/24 12:42 AM, Michal Hocko wrote:
> On Mon 19-02-24 20:37:17, Donet Tom wrote:
>>
>> On 2/19/24 19:50, Michal Hocko wrote:
>>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>> [...]
>>>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>>>> + struct mempolicy *pol)
>>>> +{
>>>> + /* if the executing node is in the policy node mask, migrate */
>>>> + if (node_isset(exec_node, pol->nodes))
>>>> + return true;
>>>> +
>>>> + /* If the folio node is in policy node mask, don't migrate */
>>>> + if (node_isset(folio_node, pol->nodes))
>>>> + return false;
>>>> + /*
>>>> + * both the folio node and executing node are outside the policy nodemask,
>>>> + * migrate as normal numa fault migration.
>>>> + */
>>>> + return true;
>>>> +}
>>> I have looked at this again and only now noticed that this doesn't
>>> really work as one would expected.
>>>
>>> case MPOL_PREFERRED_MANY:
>>> /*
>>> * use current page if in policy nodemask,
>>> * else select nearest allowed node, if any.
>>> * If no allowed nodes, use current [!misplaced].
>>> */
>>> if (node_isset(curnid, pol->nodes))
>>> goto out;
>>> z = first_zones_zonelist(
>>> node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>> gfp_zone(GFP_HIGHUSER),
>>> &pol->nodes);
>>> polnid = zone_to_nid(z->zone);
>>> break;
>>>
>>> Will collapse the whole MPOL_PREFERRED_MANY nodemask into the first
>>> notde into that mask. Is that really what we want here? Shouldn't we use
>>> the full nodemask as the migration target?
>>
>> With this patch it will take full nodemask and find out the correct migration target. It will not collapse into first node.
>
> Correct me if I am wrong, but mpol_misplaced will return the first node
> of the preffered node mask and then migrate_misplaced_folio would use
> it as a target node for alloc_misplaced_dst_folio which performs
> __GFP_THISNODE allocation so it won't fall back to a different node.
I think the confusion is between MPOL_F_MOF (migrate on fault) vs MPOL_F_MORON (protnone fault/numa fault).
With MPOL_F_MOF alone, what we wanted to achieve was to have mbind() lazily migrate the pages based on the policy node
mask. The change was introduced in commit b24f53a0bea3 ("mm: mempolicy: Add MPOL_MF_LAZY") and later dropped by
commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code"). We still have the mpol_misplaced changes
to handle the node selection for the MPOL_F_MOF flag (this is dead code IIUC).
MPOL_F_MORON was added in commit 5606e3877ad8 ("mm: numa: Migrate on reference policy"), and in the current upstream kernel only
MPOL_BIND supports that flag. With that flag specified, and with the changes in this patch, mpol_misplaced() becomes:
case MPOL_PREFERRED_MANY:
if (pol->flags & MPOL_F_MORON) {
if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
goto out;
break;
}
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
break;
....
..
}
/* Migrate the folio towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
if (!should_numa_migrate_memory(current, folio, curnid,
thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
i.e., if we can do numa migration, we select the currently executing node as the target node; otherwise
we end up returning from the function with ret = NUMA_NO_NODE.
-aneesh
On 2/20/24 6:51 AM, Andrew Morton wrote:
> On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
>
>>>> --- a/mm/mempolicy.c
>>>> +++ b/mm/mempolicy.c
>>>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>>>> if (node_isset(curnid, pol->nodes))
>>>> goto out;
>>>> z = first_zones_zonelist(
>>>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>>> + node_zonelist(thisnid, GFP_HIGHUSER),
>>>> gfp_zone(GFP_HIGHUSER),
>>>> &pol->nodes);
>>>> polnid = zone_to_nid(z->zone);
>>> int thisnid = cpu_to_node(thiscpu);
>>>
>>> Is there any dofference between numa_node_id() and
>>> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
>>> using one here and not the other?
>>
>> Hi Andrew
>>
>> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
>> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
>
> Sure, but mine was a broader thought: why do we have both? Is one
> preferable and if so why?
IIUC these are two helpers to fetch the current numa node id, and either of them can be used based on need. The default implementations show the details.
(One small difference is that numa_node_id() can use the optimized per-cpu reader because it is fetching the per-cpu variable of the currently running cpu.)
#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
return raw_cpu_read(numa_node);
}
#endif
#ifndef cpu_to_node
static inline int cpu_to_node(int cpu)
{
return per_cpu(numa_node, cpu);
}
#endif
In the mpol_misplaced() function, we need the cpu details because we are using them in another place (should_numa_migrate_memory()). So it is easier
to use cpu_to_node(thiscpu) instead of numa_node_id().
-aneesh
On 2/20/24 11:55 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <[email protected]> writes:
>
>> On 2/20/24 6:51 AM, Andrew Morton wrote:
>>> On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
>>>
>>>>>> --- a/mm/mempolicy.c
>>>>>> +++ b/mm/mempolicy.c
>>>>>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>>>>>> if (node_isset(curnid, pol->nodes))
>>>>>> goto out;
>>>>>> z = first_zones_zonelist(
>>>>>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>>>>> + node_zonelist(thisnid, GFP_HIGHUSER),
>>>>>> gfp_zone(GFP_HIGHUSER),
>>>>>> &pol->nodes);
>>>>>> polnid = zone_to_nid(z->zone);
>>>>> int thisnid = cpu_to_node(thiscpu);
>>>>>
>>>>> Is there any dofference between numa_node_id() and
>>>>> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
>>>>> using one here and not the other?
>>>>
>>>> Hi Andrew
>>>>
>>>> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
>>>> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
>>>
>>> Sure, but mine was a broader thought: why do we have both? Is one
>>> preferable and if so why?
>>
>> IIUC these are two helpers to fetch current numa node id. and either of them can be used based on need. The default implementation shows the details.
>> (One small difference is numa_node_id() can use optimized per cpu reader because it is fetching the per cpu variable of the currently running cpu.)
>>
>> #ifndef numa_node_id
>> /* Returns the number of the current Node. */
>> static inline int numa_node_id(void)
>> {
>> return raw_cpu_read(numa_node);
>> }
>> #endif
>>
>> #ifndef cpu_to_node
>> static inline int cpu_to_node(int cpu)
>> {
>> return per_cpu(numa_node, cpu);
>> }
>> #endif
>>
>> In mpol_misplaced function, we need the cpu details because we are using that in other place (should_numa_migreate_memory()). So it makes it easy
>> to use cpu_to_node(thiscpu) instead of numa_node_id().
>
> IIUC, numa_node_id() is faster than cpu_to_node(thiscpu), even if we
> have thiscpu already. cpu_to_node() is mainly used to get the node of
> NOT current CPU. So, IMHO, we should only use numa_node_id() in this
> function.
>
This change?
modified mm/mempolicy.c
@@ -2502,8 +2502,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
pgoff_t ilx;
struct zoneref *z;
int curnid = folio_nid(folio);
- int thiscpu = raw_smp_processor_id();
- int thisnid = cpu_to_node(thiscpu);
+ int thisnid = numa_node_id();
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
@@ -2573,7 +2572,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = thisnid;
if (!should_numa_migrate_memory(current, folio, curnid,
- thiscpu))
+ raw_smp_processor_id()))
goto out;
}
-aneesh
Donet Tom <[email protected]> writes:
> On 2/19/24 17:37, Michal Hocko wrote:
>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>> memory policy. This allowed numa fault migration when the executing node
>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>> support to MPOL_PREFERRED_MANY policy.
>>>
>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>> the kernel should not allocate pages from the slower memory tier via
>>> allocation control zonelist fallback. Instead, we should move cold pages
>>> from the faster memory node via memory demotion. For a page allocation,
>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>> the allocation zone list. This implies that, without using memory
>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>
>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>> allocation control when we have memory tiers in the system. With
>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>> of faster memory nodes. When we fail to allocate pages from the faster
>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>> to slower memory nodes.
>>>
>>> With the current kernel, such usage of memory policies implies we can't
>>> do page promotion from a slower memory tier to a faster memory tier
>>> using numa fault. This patch fixes this issue.
>>>
>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>> mask, we allow numa migration to the executing nodes. If the executing
>>> node is not in the policy node mask but the folio is already allocated
>>> based on policy preference (the folio node is in the policy node mask),
>>> we don't allow numa migration. If both the executing node and folio node
>>> are outside the policy node mask, we allow numa migration to the
>>> executing nodes.
>> The feature makes sense to me. How has this been tested? Do you have any
>> numbers to present?
>
> Hi Michal
>
> I have a test program which allocate memory on a specified node and
> trigger the promotion or migration (Keep accessing the pages).
>
> Without this patch if we set MPOL_PREFERRED_MANY promotion or migration was not happening
> with this patch I could see pages are getting migrated or promoted.
>
> My system has 2 CPU+DRAM node (Tier 1) and 1 PMEM node(Tier 2). Below
> are my test results.
>
> In below table N0 and N1 are Tier1 Nodes. N6 is the Tier2 Node.
> Exec_Node is the execution node, Policy is the nodes in nodemask and
> "Curr Location Pages" is the node where pages present before migration
> or promotion start.
>
> Tests Results
> ------------------
> Scenario 1: if the executing node is in the policy node mask
> ================================================================================
> Exec_Node Policy Curr Location Pages Observations
> ================================================================================
> N0 N0 N1 N6 N1 Pages Migrated from N1 to N0
> N0 N0 N1 N6 N6 Pages Promoted from N6 to N0
> N0 N0 N1 N1 Pages Migrated from N1 to N0
> N0 N0 N1 N6 Pages Promoted from N6 to N0
>
> Scenario 2: If the folio node is in policy node mask and Exec node not in policy node mask
> ================================================================================
> Exec_Node Policy Curr Location Pages Observations
> ================================================================================
> N0 N1 N6 N1 Pages are not Migrating to N0
> N0 N1 N6 N6 Pages are not migration to N0
> N0 N1 N1 Pages are not Migrating to N0
>
> Scenario 3: both the folio node and executing node are outside the policy nodemask
> ==============================================================================
> Exec_Node Policy Curr Location Pages Observations
> ==============================================================================
> N0 N1 N6 Pages Promoted from N6 to N0
> N0 N6 N1 Pages Migrated from N1 to N0
>
Please use some benchmarks (e.g., redis + memtier) and show the
proc-vmstat stats and benchmark scores.
Not part of the kernel series, but don't forget to submit patches to the
man pages project and numactl tool to let users use it.
--
Best Regards,
Huang, Ying
> Thanks
> Donet Tom
>
>>
>>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>>> Signed-off-by: Donet Tom <[email protected]>
>>> ---
>>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>>> 1 file changed, 26 insertions(+), 2 deletions(-)
>> I haven't spotted anything obviously wrong in the patch itself but I
>> admit this is not an area I am actively familiar with so I might be
>> missing something.
"Aneesh Kumar K.V" <[email protected]> writes:
> On 2/20/24 6:51 AM, Andrew Morton wrote:
>> On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
>>
>>>>> --- a/mm/mempolicy.c
>>>>> +++ b/mm/mempolicy.c
>>>>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>>>>> if (node_isset(curnid, pol->nodes))
>>>>> goto out;
>>>>> z = first_zones_zonelist(
>>>>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>>>> + node_zonelist(thisnid, GFP_HIGHUSER),
>>>>> gfp_zone(GFP_HIGHUSER),
>>>>> &pol->nodes);
>>>>> polnid = zone_to_nid(z->zone);
>>>> int thisnid = cpu_to_node(thiscpu);
>>>>
>>>> Is there any dofference between numa_node_id() and
>>>> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
>>>> using one here and not the other?
>>>
>>> Hi Andrew
>>>
>>> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
>>> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
>>
>> Sure, but mine was a broader thought: why do we have both? Is one
>> preferable and if so why?
>
> IIUC these are two helpers to fetch current numa node id. and either of them can be used based on need. The default implementation shows the details.
> (One small difference is numa_node_id() can use optimized per cpu reader because it is fetching the per cpu variable of the currently running cpu.)
>
> #ifndef numa_node_id
> /* Returns the number of the current Node. */
> static inline int numa_node_id(void)
> {
> return raw_cpu_read(numa_node);
> }
> #endif
>
> #ifndef cpu_to_node
> static inline int cpu_to_node(int cpu)
> {
> return per_cpu(numa_node, cpu);
> }
> #endif
>
> In mpol_misplaced function, we need the cpu details because we are using that in other place (should_numa_migreate_memory()). So it makes it easy
> to use cpu_to_node(thiscpu) instead of numa_node_id().
IIUC, numa_node_id() is faster than cpu_to_node(thiscpu), even if we
have thiscpu already. cpu_to_node() is mainly used to get the node of a
CPU other than the current one. So, IMHO, we should only use
numa_node_id() in this function.
--
Best Regards,
Huang, Ying
On 2/20/24 12:06 PM, Huang, Ying wrote:
> Donet Tom <[email protected]> writes:
>
>> On 2/19/24 17:37, Michal Hocko wrote:
>>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>> memory policy. This allowed numa fault migration when the executing node
>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>> support to MPOL_PREFERRED_MANY policy.
>>>>
>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>> the kernel should not allocate pages from the slower memory tier via
>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>> from the faster memory node via memory demotion. For a page allocation,
>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>> the allocation zone list. This implies that, without using memory
>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>
>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>> allocation control when we have memory tiers in the system. With
>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>> to slower memory nodes.
>>>>
>>>> With the current kernel, such usage of memory policies implies we can't
>>>> do page promotion from a slower memory tier to a faster memory tier
>>>> using numa fault. This patch fixes this issue.
>>>>
>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>> node is not in the policy node mask but the folio is already allocated
>>>> based on policy preference (the folio node is in the policy node mask),
>>>> we don't allow numa migration. If both the executing node and folio node
>>>> are outside the policy node mask, we allow numa migration to the
>>>> executing nodes.
>>> The feature makes sense to me. How has this been tested? Do you have any
>>> numbers to present?
>>
>> Hi Michal
>>
>> I have a test program which allocate memory on a specified node and
>> trigger the promotion or migration (Keep accessing the pages).
>>
>> Without this patch if we set MPOL_PREFERRED_MANY promotion or migration was not happening
>> with this patch I could see pages are getting migrated or promoted.
>>
>> My system has 2 CPU+DRAM node (Tier 1) and 1 PMEM node(Tier 2). Below
>> are my test results.
>>
>> In below table N0 and N1 are Tier1 Nodes. N6 is the Tier2 Node.
>> Exec_Node is the execution node, Policy is the nodes in nodemask and
>> "Curr Location Pages" is the node where pages present before migration
>> or promotion start.
>>
>> Tests Results
>> ------------------
>> Scenario 1: if the executing node is in the policy node mask
>> ================================================================================
>> Exec_Node Policy Curr Location Pages Observations
>> ================================================================================
>> N0 N0 N1 N6 N1 Pages Migrated from N1 to N0
>> N0 N0 N1 N6 N6 Pages Promoted from N6 to N0
>> N0 N0 N1 N1 Pages Migrated from N1 to N0
>> N0 N0 N1 N6 Pages Promoted from N6 to N0
>>
>> Scenario 2: If the folio node is in policy node mask and Exec node not in policy node mask
>> ================================================================================
>> Exec_Node Policy Curr Location Pages Observations
>> ================================================================================
>> N0 N1 N6 N1 Pages are not Migrating to N0
>> N0 N1 N6 N6 Pages are not migration to N0
>> N0 N1 N1 Pages are not Migrating to N0
>>
>> Scenario 3: both the folio node and executing node are outside the policy nodemask
>> ==============================================================================
>> Exec_Node Policy Curr Location Pages Observations
>> ==============================================================================
>> N0 N1 N6 Pages Promoted from N6 to N0
>> N0 N6 N1 Pages Migrated from N1 to N0
>>
>
> Please use some benchmarks (e.g., redis + memtier) and show the
> proc-vmstat stats and benchamrk score.
Without this change, numa fault migration is not supported with the MPOL_PREFERRED_MANY
policy, so there is no performance comparison with and without the patch. W.r.t. the
effectiveness of numa fault migration, that is a different topic from this patch.
-aneesh
On 2/20/24 12:02 PM, Aneesh Kumar K.V wrote:
> On 2/20/24 11:55 AM, Huang, Ying wrote:
>> "Aneesh Kumar K.V" <[email protected]> writes:
>>
>>> On 2/20/24 6:51 AM, Andrew Morton wrote:
>>>> On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
>>>>
>>>>>>> --- a/mm/mempolicy.c
>>>>>>> +++ b/mm/mempolicy.c
>>>>>>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>>>>>>> if (node_isset(curnid, pol->nodes))
>>>>>>> goto out;
>>>>>>> z = first_zones_zonelist(
>>>>>>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>>>>>> + node_zonelist(thisnid, GFP_HIGHUSER),
>>>>>>> gfp_zone(GFP_HIGHUSER),
>>>>>>> &pol->nodes);
>>>>>>> polnid = zone_to_nid(z->zone);
>>>>>> int thisnid = cpu_to_node(thiscpu);
>>>>>>
>>>>>> Is there any dofference between numa_node_id() and
>>>>>> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
>>>>>> using one here and not the other?
>>>>>
>>>>> Hi Andrew
>>>>>
>>>>> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
>>>>> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
>>>>
>>>> Sure, but mine was a broader thought: why do we have both? Is one
>>>> preferable and if so why?
>>>
>>> IIUC these are two helpers to fetch current numa node id. and either of them can be used based on need. The default implementation shows the details.
>>> (One small difference is numa_node_id() can use optimized per cpu reader because it is fetching the per cpu variable of the currently running cpu.)
>>>
>>> #ifndef numa_node_id
>>> /* Returns the number of the current Node. */
>>> static inline int numa_node_id(void)
>>> {
>>> return raw_cpu_read(numa_node);
>>> }
>>> #endif
>>>
>>> #ifndef cpu_to_node
>>> static inline int cpu_to_node(int cpu)
>>> {
>>> return per_cpu(numa_node, cpu);
>>> }
>>> #endif
>>>
>>> In mpol_misplaced function, we need the cpu details because we are using that in other place (should_numa_migreate_memory()). So it makes it easy
>>> to use cpu_to_node(thiscpu) instead of numa_node_id().
>>
>> IIUC, numa_node_id() is faster than cpu_to_node(thiscpu), even if we
>> have thiscpu already. cpu_to_node() is mainly used to get the node of
>> NOT current CPU. So, IMHO, we should only use numa_node_id() in this
>> function.
>>
>
> This change?
>
> modified mm/mempolicy.c
> @@ -2502,8 +2502,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> pgoff_t ilx;
> struct zoneref *z;
> int curnid = folio_nid(folio);
> - int thiscpu = raw_smp_processor_id();
> - int thisnid = cpu_to_node(thiscpu);
> + int thisnid = numa_node_id();
> int polnid = NUMA_NO_NODE;
> int ret = NUMA_NO_NODE;
>
> @@ -2573,7 +2572,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> polnid = thisnid;
>
> if (!should_numa_migrate_memory(current, folio, curnid,
> - thiscpu))
> + raw_smp_processor_id()))
> goto out;
> }
>
>
One of the problems with the above change will be the need to make sure the smp processor id remains stable, which
I am not sure we want in this function. With that change we can end up with a processor id not related to the numa node id
we are using.
-aneesh
Donet Tom <[email protected]> writes:
> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
> nodes") added support for migrate on protnone reference with MPOL_BIND
> memory policy. This allowed numa fault migration when the executing node
> is part of the policy mask for MPOL_BIND. This patch extends migration
> support to MPOL_PREFERRED_MANY policy.
>
> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
> the kernel should not allocate pages from the slower memory tier via
> allocation control zonelist fallback. Instead, we should move cold pages
> from the faster memory node via memory demotion. For a page allocation,
> kswapd is only woken up after we try to allocate pages from all nodes in
> the allocation zone list. This implies that, without using memory
> policies, we will end up allocating hot pages in the slower memory tier.
>
> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
> allocation control when we have memory tiers in the system. With
> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
> of faster memory nodes. When we fail to allocate pages from the faster
> memory node, kswapd would be woken up, allowing demotion of cold pages
> to slower memory nodes.
>
> With the current kernel, such usage of memory policies implies we can't
> do page promotion from a slower memory tier to a faster memory tier
> using numa fault. This patch fixes this issue.
>
> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
> mask, we allow numa migration to the executing nodes. If the executing
> node is not in the policy node mask but the folio is already allocated
> based on policy preference (the folio node is in the policy node mask),
> we don't allow numa migration. If both the executing node and folio node
> are outside the policy node mask, we allow numa migration to the
> executing nodes.
>
> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
> Signed-off-by: Donet Tom <[email protected]>
> ---
> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
> 1 file changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 73d698e21dae..8c4c92b10371 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
> if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
> return -EINVAL;
> if (*flags & MPOL_F_NUMA_BALANCING) {
> - if (*mode != MPOL_BIND)
> + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
> + *flags |= (MPOL_F_MOF | MPOL_F_MORON);
> + else
> return -EINVAL;
> - *flags |= (MPOL_F_MOF | MPOL_F_MORON);
> }
> return 0;
> }
> @@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
> kmem_cache_free(sn_cache, n);
> }
>
> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
> + struct mempolicy *pol)
> +{
> + /* if the executing node is in the policy node mask, migrate */
> + if (node_isset(exec_node, pol->nodes))
> + return true;
> +
> + /* If the folio node is in policy node mask, don't migrate */
> + if (node_isset(folio_node, pol->nodes))
> + return false;
> + /*
> + * both the folio node and executing node are outside the policy nodemask,
> + * migrate as normal numa fault migration.
> + */
> + return true;
Why? This may cause some unexpected results. For example, pages may be
distributed among multiple sockets unexpectedly. So, I prefer the more
conservative policy, that is, only migrate if this node is in
pol->nodes.
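(A sketch of that more conservative variant, keeping the helper's
signature from the patch purely for illustration, would be:

	static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
							       struct mempolicy *pol)
	{
		/* Only migrate towards the executing node if it is in the policy nodemask. */
		return node_isset(exec_node, pol->nodes);
	}

with folio_node then becoming unused.)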
--
Best Regards,
Huang, Ying
> +}
> +
> /**
> * mpol_misplaced - check whether current folio node is valid in policy
> *
> @@ -2526,6 +2544,12 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> break;
>
> case MPOL_PREFERRED_MANY:
> + if (pol->flags & MPOL_F_MORON) {
> + if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
> + goto out;
> + break;
> + }
> +
> /*
> * use current page if in policy nodemask,
> * else select nearest allowed node, if any.
"Aneesh Kumar K.V" <[email protected]> writes:
> On 2/20/24 12:02 PM, Aneesh Kumar K.V wrote:
>> On 2/20/24 11:55 AM, Huang, Ying wrote:
>>> "Aneesh Kumar K.V" <[email protected]> writes:
>>>
>>>> On 2/20/24 6:51 AM, Andrew Morton wrote:
>>>>> On Mon, 19 Feb 2024 14:04:23 +0530 Donet Tom <[email protected]> wrote:
>>>>>
>>>>>>>> --- a/mm/mempolicy.c
>>>>>>>> +++ b/mm/mempolicy.c
>>>>>>>> @@ -2526,7 +2526,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>>>>>>>> if (node_isset(curnid, pol->nodes))
>>>>>>>> goto out;
>>>>>>>> z = first_zones_zonelist(
>>>>>>>> - node_zonelist(numa_node_id(), GFP_HIGHUSER),
>>>>>>>> + node_zonelist(thisnid, GFP_HIGHUSER),
>>>>>>>> gfp_zone(GFP_HIGHUSER),
>>>>>>>> &pol->nodes);
>>>>>>>> polnid = zone_to_nid(z->zone);
>>>>>>> int thisnid = cpu_to_node(thiscpu);
>>>>>>>
>>>>>>> Is there any dofference between numa_node_id() and
>>>>>>> cpu_to_node(raw_smp_processor_id())? And it it explicable that we're
>>>>>>> using one here and not the other?
>>>>>>
>>>>>> Hi Andrew
>>>>>>
>>>>>> Both numa_node_id() and cpu_to_node(raw_smp_processor_id()) return the current execution node id,
>>>>>> Since the current execution node is already fetched at the beginning (thisnid) we can reuse it instead of getting it again.
>>>>>
>>>>> Sure, but mine was a broader thought: why do we have both? Is one
>>>>> preferable and if so why?
>>>>
>>>> IIUC these are two helpers to fetch current numa node id. and either of them can be used based on need. The default implementation shows the details.
>>>> (One small difference is numa_node_id() can use optimized per cpu reader because it is fetching the per cpu variable of the currently running cpu.)
>>>>
>>>> #ifndef numa_node_id
>>>> /* Returns the number of the current Node. */
>>>> static inline int numa_node_id(void)
>>>> {
>>>> return raw_cpu_read(numa_node);
>>>> }
>>>> #endif
>>>>
>>>> #ifndef cpu_to_node
>>>> static inline int cpu_to_node(int cpu)
>>>> {
>>>> return per_cpu(numa_node, cpu);
>>>> }
>>>> #endif
>>>>
>>>> In mpol_misplaced function, we need the cpu details because we are using that in other place (should_numa_migreate_memory()). So it makes it easy
>>>> to use cpu_to_node(thiscpu) instead of numa_node_id().
>>>
>>> IIUC, numa_node_id() is faster than cpu_to_node(thiscpu), even if we
>>> have thiscpu already. cpu_to_node() is mainly used to get the node of
>>> NOT current CPU. So, IMHO, we should only use numa_node_id() in this
>>> function.
>>>
>>
>> This change?
>>
>> modified mm/mempolicy.c
>> @@ -2502,8 +2502,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>> pgoff_t ilx;
>> struct zoneref *z;
>> int curnid = folio_nid(folio);
>> - int thiscpu = raw_smp_processor_id();
>> - int thisnid = cpu_to_node(thiscpu);
>> + int thisnid = numa_node_id();
>> int polnid = NUMA_NO_NODE;
>> int ret = NUMA_NO_NODE;
>>
>> @@ -2573,7 +2572,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
>> polnid = thisnid;
>>
>> if (!should_numa_migrate_memory(current, folio, curnid,
>> - thiscpu))
>> + raw_smp_processor_id()))
>> goto out;
>> }
>>
>>
>
> One of the problem with the above change will be the need to make sure smp processor id remaining stable, which
> I am not sure we want in this function. With that we can end up with processor id not related to the numa node id
> we are using.
This isn't an issue now, because mpol_misplaced() is always called with the
PTL held. And we can still keep the thiscpu local variable.
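(i.e., roughly something like the following, as a sketch of that suggestion:

	int thiscpu = raw_smp_processor_id();	/* still passed to should_numa_migrate_memory() */
	int thisnid = numa_node_id();		/* per-cpu-optimized read for the node id */
)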
--
Best Regards,
Huang, Ying
"Aneesh Kumar K.V" <[email protected]> writes:
> On 2/20/24 12:06 PM, Huang, Ying wrote:
>> Donet Tom <[email protected]> writes:
>>
>>> On 2/19/24 17:37, Michal Hocko wrote:
>>>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>>> memory policy. This allowed numa fault migration when the executing node
>>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>>> support to MPOL_PREFERRED_MANY policy.
>>>>>
>>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>>> the kernel should not allocate pages from the slower memory tier via
>>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>>> from the faster memory node via memory demotion. For a page allocation,
>>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>>> the allocation zone list. This implies that, without using memory
>>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>>
>>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>>> allocation control when we have memory tiers in the system. With
>>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>>> to slower memory nodes.
>>>>>
>>>>> With the current kernel, such usage of memory policies implies we can't
>>>>> do page promotion from a slower memory tier to a faster memory tier
>>>>> using numa fault. This patch fixes this issue.
>>>>>
>>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>>> node is not in the policy node mask but the folio is already allocated
>>>>> based on policy preference (the folio node is in the policy node mask),
>>>>> we don't allow numa migration. If both the executing node and folio node
>>>>> are outside the policy node mask, we allow numa migration to the
>>>>> executing nodes.
>>>> The feature makes sense to me. How has this been tested? Do you have any
>>>> numbers to present?
>>>
>>> Hi Michal
>>>
>>> I have a test program which allocate memory on a specified node and
>>> trigger the promotion or migration (Keep accessing the pages).
>>>
>>> Without this patch if we set MPOL_PREFERRED_MANY promotion or migration was not happening
>>> with this patch I could see pages are getting migrated or promoted.
>>>
>>> My system has 2 CPU+DRAM node (Tier 1) and 1 PMEM node(Tier 2). Below
>>> are my test results.
>>>
>>> In below table N0 and N1 are Tier1 Nodes. N6 is the Tier2 Node.
>>> Exec_Node is the execution node, Policy is the nodes in nodemask and
>>> "Curr Location Pages" is the node where pages present before migration
>>> or promotion start.
>>>
>>> Tests Results
>>> ------------------
>>> Scenario 1: if the executing node is in the policy node mask
>>> ================================================================================
>>> Exec_Node Policy Curr Location Pages Observations
>>> ================================================================================
>>> N0 N0 N1 N6 N1 Pages Migrated from N1 to N0
>>> N0 N0 N1 N6 N6 Pages Promoted from N6 to N0
>>> N0 N0 N1 N1 Pages Migrated from N1 to N0
>>> N0 N0 N1 N6 Pages Promoted from N6 to N0
>>>
>>> Scenario 2: If the folio node is in policy node mask and Exec node not in policy node mask
>>> ================================================================================
>>> Exec_Node Policy Curr Location Pages Observations
>>> ================================================================================
>>> N0 N1 N6 N1 Pages are not Migrating to N0
>>> N0 N1 N6 N6 Pages are not migration to N0
>>> N0 N1 N1 Pages are not Migrating to N0
>>>
>>> Scenario 3: both the folio node and executing node are outside the policy nodemask
>>> ==============================================================================
>>> Exec_Node Policy Curr Location Pages Observations
>>> ==============================================================================
>>> N0 N1 N6 Pages Promoted from N6 to N0
>>> N0 N6 N1 Pages Migrated from N1 to N0
>>>
>>
>> Please use some benchmarks (e.g., redis + memtier) and show the
>> proc-vmstat stats and benchamrk score.
>
>
> Without this change numa fault migration is not supported with MPOL_PREFERRED_MANY
> policy. So there is no performance comparison with and without patch. W.rt effectiveness of numa
> fault migration, that is a different topic from this patch
IIUC, the goal of the patch is to optimize performance, right? If so,
the benchmark score will help justify the change.
--
Best Regards,
Huang, Ying
"Huang, Ying" <[email protected]> writes:
> "Aneesh Kumar K.V" <[email protected]> writes:
>
>> On 2/20/24 12:06 PM, Huang, Ying wrote:
>>> Donet Tom <[email protected]> writes:
>>>
>>>> On 2/19/24 17:37, Michal Hocko wrote:
>>>>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>>>> memory policy. This allowed numa fault migration when the executing node
>>>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>>>> support to MPOL_PREFERRED_MANY policy.
>>>>>>
>>>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>>>> the kernel should not allocate pages from the slower memory tier via
>>>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>>>> from the faster memory node via memory demotion. For a page allocation,
>>>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>>>> the allocation zone list. This implies that, without using memory
>>>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>>>
>>>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>>>> allocation control when we have memory tiers in the system. With
>>>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>>>> to slower memory nodes.
>>>>>>
>>>>>> With the current kernel, such usage of memory policies implies we can't
>>>>>> do page promotion from a slower memory tier to a faster memory tier
>>>>>> using numa fault. This patch fixes this issue.
>>>>>>
>>>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>>>> node is not in the policy node mask but the folio is already allocated
>>>>>> based on policy preference (the folio node is in the policy node mask),
>>>>>> we don't allow numa migration. If both the executing node and folio node
>>>>>> are outside the policy node mask, we allow numa migration to the
>>>>>> executing nodes.
>>>>> The feature makes sense to me. How has this been tested? Do you have any
>>>>> numbers to present?
>>>>
>>>> Hi Michal
>>>>
>>>> I have a test program which allocate memory on a specified node and
>>>> trigger the promotion or migration (Keep accessing the pages).
>>>>
>>>> Without this patch if we set MPOL_PREFERRED_MANY promotion or migration was not happening
>>>> with this patch I could see pages are getting migrated or promoted.
>>>>
>>>> My system has 2 CPU+DRAM node (Tier 1) and 1 PMEM node(Tier 2). Below
>>>> are my test results.
>>>>
>>>> In below table N0 and N1 are Tier1 Nodes. N6 is the Tier2 Node.
>>>> Exec_Node is the execution node, Policy is the nodes in nodemask and
>>>> "Curr Location Pages" is the node where pages present before migration
>>>> or promotion start.
>>>>
>>>> Tests Results
>>>> ------------------
>>>> Scenario 1: if the executing node is in the policy node mask
>>>> ================================================================================
>>>> Exec_Node Policy Curr Location Pages Observations
>>>> ================================================================================
>>>> N0 N0 N1 N6 N1 Pages Migrated from N1 to N0
>>>> N0 N0 N1 N6 N6 Pages Promoted from N6 to N0
>>>> N0 N0 N1 N1 Pages Migrated from N1 to N0
>>>> N0 N0 N1 N6 Pages Promoted from N6 to N0
>>>>
>>>> Scenario 2: If the folio node is in policy node mask and Exec node not in policy node mask
>>>> ================================================================================
>>>> Exec_Node Policy Curr Location Pages Observations
>>>> ================================================================================
>>>> N0 N1 N6 N1 Pages are not Migrating to N0
>>>> N0 N1 N6 N6 Pages are not migration to N0
>>>> N0 N1 N1 Pages are not Migrating to N0
>>>>
>>>> Scenario 3: both the folio node and executing node are outside the policy nodemask
>>>> ==============================================================================
>>>> Exec_Node Policy Curr Location Pages Observations
>>>> ==============================================================================
>>>> N0 N1 N6 Pages Promoted from N6 to N0
>>>> N0 N6 N1 Pages Migrated from N1 to N0
>>>>
>>>
>>> Please use some benchmarks (e.g., redis + memtier) and show the
>>> proc-vmstat stats and benchamrk score.
>>
>>
>> Without this change numa fault migration is not supported with MPOL_PREFERRED_MANY
>> policy. So there is no performance comparison with and without patch. W.r.t effectiveness of numa
>> fault migration, that is a different topic from this patch
>
> IIUC, the goal of the patch is to optimize performance, right? If so,
> the benchmark score will help justify the change.
>
The objective is to enable numa fault migration with the
MPOL_PREFERRED_MANY policy, which is essential for memory demotion to
work correctly in conjunction with memory promotion. Once promotion is
possible under this policy, we should see the same benefits that numa
fault memory promotion already provides. The actual benefit of numa
fault migration depends on various factors such as the speed of the
slower memory device, the access pattern of the application, etc. We
are discussing its effectiveness and how to reduce numa fault overhead
in other forums. However, we believe that discussion should not hold up
the merging of this patch.
This change is similar to commit bda420b98505 ("numa balancing: migrate
on fault among multiple bound nodes")
-aneesh
"Huang, Ying" <[email protected]> writes:
> Donet Tom <[email protected]> writes:
>
>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>> nodes") added support for migrate on protnone reference with MPOL_BIND
>> memory policy. This allowed numa fault migration when the executing node
>> is part of the policy mask for MPOL_BIND. This patch extends migration
>> support to MPOL_PREFERRED_MANY policy.
>>
>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>> the kernel should not allocate pages from the slower memory tier via
>> allocation control zonelist fallback. Instead, we should move cold pages
>> from the faster memory node via memory demotion. For a page allocation,
>> kswapd is only woken up after we try to allocate pages from all nodes in
>> the allocation zone list. This implies that, without using memory
>> policies, we will end up allocating hot pages in the slower memory tier.
>>
>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>> allocation control when we have memory tiers in the system. With
>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>> of faster memory nodes. When we fail to allocate pages from the faster
>> memory node, kswapd would be woken up, allowing demotion of cold pages
>> to slower memory nodes.
>>
>> With the current kernel, such usage of memory policies implies we can't
>> do page promotion from a slower memory tier to a faster memory tier
>> using numa fault. This patch fixes this issue.
>>
>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>> mask, we allow numa migration to the executing nodes. If the executing
>> node is not in the policy node mask but the folio is already allocated
>> based on policy preference (the folio node is in the policy node mask),
>> we don't allow numa migration. If both the executing node and folio node
>> are outside the policy node mask, we allow numa migration to the
>> executing nodes.
>>
>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>> Signed-off-by: Donet Tom <[email protected]>
>> ---
>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>> 1 file changed, 26 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 73d698e21dae..8c4c92b10371 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
>> if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
>> return -EINVAL;
>> if (*flags & MPOL_F_NUMA_BALANCING) {
>> - if (*mode != MPOL_BIND)
>> + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
>> + *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>> + else
>> return -EINVAL;
>> - *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>> }
>> return 0;
>> }
>> @@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
>> kmem_cache_free(sn_cache, n);
>> }
>>
>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>> + struct mempolicy *pol)
>> +{
>> + /* if the executing node is in the policy node mask, migrate */
>> + if (node_isset(exec_node, pol->nodes))
>> + return true;
>> +
>> + /* If the folio node is in policy node mask, don't migrate */
>> + if (node_isset(folio_node, pol->nodes))
>> + return false;
>> + /*
>> + * both the folio node and executing node are outside the policy nodemask,
>> + * migrate as normal numa fault migration.
>> + */
>> + return true;
>
> Why? This may cause some unexpected result. For example, pages may be
> distributed among multiple sockets unexpectedly. So, I prefer the more
> conservative policy, that is, only migrate if this node is in
> pol->nodes.
>
This will only have an impact if the user specifies
MPOL_F_NUMA_BALANCING, which means the user is explicitly requesting
that frequently accessed memory pages be migrated. Memory policy
MPOL_PREFERRED_MANY is able to allocate pages from nodes outside of
policy->nodes. For the specific use case that I am interested in, it
should be okay to restrict it to policy->nodes. However, I am wondering
if this is too restrictive given the definition of MPOL_PREFERRED_MANY.
-aneesh
Aneesh Kumar K.V <[email protected]> writes:
> "Huang, Ying" <[email protected]> writes:
>
>> Donet Tom <[email protected]> writes:
>>
>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>> memory policy. This allowed numa fault migration when the executing node
>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>> support to MPOL_PREFERRED_MANY policy.
>>>
>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>> the kernel should not allocate pages from the slower memory tier via
>>> allocation control zonelist fallback. Instead, we should move cold pages
>>> from the faster memory node via memory demotion. For a page allocation,
>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>> the allocation zone list. This implies that, without using memory
>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>
>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>> allocation control when we have memory tiers in the system. With
>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>> of faster memory nodes. When we fail to allocate pages from the faster
>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>> to slower memory nodes.
>>>
>>> With the current kernel, such usage of memory policies implies we can't
>>> do page promotion from a slower memory tier to a faster memory tier
>>> using numa fault. This patch fixes this issue.
>>>
>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>> mask, we allow numa migration to the executing nodes. If the executing
>>> node is not in the policy node mask but the folio is already allocated
>>> based on policy preference (the folio node is in the policy node mask),
>>> we don't allow numa migration. If both the executing node and folio node
>>> are outside the policy node mask, we allow numa migration to the
>>> executing nodes.
>>>
>>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>>> Signed-off-by: Donet Tom <[email protected]>
>>> ---
>>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>>> 1 file changed, 26 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>>> index 73d698e21dae..8c4c92b10371 100644
>>> --- a/mm/mempolicy.c
>>> +++ b/mm/mempolicy.c
>>> @@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
>>> if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
>>> return -EINVAL;
>>> if (*flags & MPOL_F_NUMA_BALANCING) {
>>> - if (*mode != MPOL_BIND)
>>> + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
>>> + *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>> + else
>>> return -EINVAL;
>>> - *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>> }
>>> return 0;
>>> }
>>> @@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
>>> kmem_cache_free(sn_cache, n);
>>> }
>>>
>>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>>> + struct mempolicy *pol)
>>> +{
>>> + /* if the executing node is in the policy node mask, migrate */
>>> + if (node_isset(exec_node, pol->nodes))
>>> + return true;
>>> +
>>> + /* If the folio node is in policy node mask, don't migrate */
>>> + if (node_isset(folio_node, pol->nodes))
>>> + return false;
>>> + /*
>>> + * both the folio node and executing node are outside the policy nodemask,
>>> + * migrate as normal numa fault migration.
>>> + */
>>> + return true;
>>
>> Why? This may cause some unexpected result. For example, pages may be
>> distributed among multiple sockets unexpectedly. So, I prefer the more
>> conservative policy, that is, only migrate if this node is in
>> pol->nodes.
>>
>
> This will only have an impact if the user specifies
> MPOL_F_NUMA_BALANCING. This means that the user is explicitly requesting
> for frequently accessed memory pages to be migrated. Memory policy
> MPOL_PREFERRED_MANY is able to allocate pages from nodes outside of
> policy->nodes. For the specific use case that I am interested in, it
> should be okay to restrict it to policy->nodes. However, I am wondering
> if this is too restrictive given the definition of MPOL_PREFERRED_MANY.
IMHO, we can start with the more conservative way and expand it if that
proves necessary.
--
Best Regards,
Huang, Ying
Aneesh Kumar K.V <[email protected]> writes:
> "Huang, Ying" <[email protected]> writes:
>
>> "Aneesh Kumar K.V" <[email protected]> writes:
>>
>>> On 2/20/24 12:06 PM, Huang, Ying wrote:
>>>> Donet Tom <[email protected]> writes:
>>>>
>>>>> On 2/19/24 17:37, Michal Hocko wrote:
>>>>>> On Sat 17-02-24 01:31:35, Donet Tom wrote:
>>>>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>>>>> memory policy. This allowed numa fault migration when the executing node
>>>>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>>>>> support to MPOL_PREFERRED_MANY policy.
>>>>>>>
>>>>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>>>>> the kernel should not allocate pages from the slower memory tier via
>>>>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>>>>> from the faster memory node via memory demotion. For a page allocation,
>>>>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>>>>> the allocation zone list. This implies that, without using memory
>>>>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>>>>
>>>>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>>>>> allocation control when we have memory tiers in the system. With
>>>>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>>>>> to slower memory nodes.
>>>>>>>
>>>>>>> With the current kernel, such usage of memory policies implies we can't
>>>>>>> do page promotion from a slower memory tier to a faster memory tier
>>>>>>> using numa fault. This patch fixes this issue.
>>>>>>>
>>>>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>>>>> node is not in the policy node mask but the folio is already allocated
>>>>>>> based on policy preference (the folio node is in the policy node mask),
>>>>>>> we don't allow numa migration. If both the executing node and folio node
>>>>>>> are outside the policy node mask, we allow numa migration to the
>>>>>>> executing nodes.
>>>>>> The feature makes sense to me. How has this been tested? Do you have any
>>>>>> numbers to present?
>>>>>
>>>>> Hi Michal
>>>>>
>>>>> I have a test program which allocate memory on a specified node and
>>>>> trigger the promotion or migration (Keep accessing the pages).
>>>>>
>>>>> Without this patch if we set MPOL_PREFERRED_MANY promotion or migration was not happening
>>>>> with this patch I could see pages are getting migrated or promoted.
>>>>>
>>>>> My system has 2 CPU+DRAM node (Tier 1) and 1 PMEM node(Tier 2). Below
>>>>> are my test results.
>>>>>
>>>>> In below table N0 and N1 are Tier1 Nodes. N6 is the Tier2 Node.
>>>>> Exec_Node is the execution node, Policy is the nodes in nodemask and
>>>>> "Curr Location Pages" is the node where pages present before migration
>>>>> or promotion start.
>>>>>
>>>>> Tests Results
>>>>> ------------------
>>>>> Scenario 1: if the executing node is in the policy node mask
>>>>> ================================================================================
>>>>> Exec_Node Policy Curr Location Pages Observations
>>>>> ================================================================================
>>>>> N0 N0 N1 N6 N1 Pages Migrated from N1 to N0
>>>>> N0 N0 N1 N6 N6 Pages Promoted from N6 to N0
>>>>> N0 N0 N1 N1 Pages Migrated from N1 to N0
>>>>> N0 N0 N1 N6 Pages Promoted from N6 to N0
>>>>>
>>>>> Scenario 2: If the folio node is in policy node mask and Exec node not in policy node mask
>>>>> ================================================================================
>>>>> Exec_Node Policy Curr Location Pages Observations
>>>>> ================================================================================
>>>>> N0 N1 N6 N1 Pages are not Migrating to N0
>>>>> N0 N1 N6 N6 Pages are not migration to N0
>>>>> N0 N1 N1 Pages are not Migrating to N0
>>>>>
>>>>> Scenario 3: both the folio node and executing node are outside the policy nodemask
>>>>> ==============================================================================
>>>>> Exec_Node Policy Curr Location Pages Observations
>>>>> ==============================================================================
>>>>> N0 N1 N6 Pages Promoted from N6 to N0
>>>>> N0 N6 N1 Pages Migrated from N1 to N0
>>>>>
>>>>
>>>> Please use some benchmarks (e.g., redis + memtier) and show the
>>>> proc-vmstat stats and benchamrk score.
>>>
>>>
>>> Without this change numa fault migration is not supported with MPOL_PREFERRED_MANY
>>> policy. So there is no performance comparison with and without patch. Wr.t effectiveness of numa
>>> fault migration, that is a different topic from this patch
>>
>> IIUC, the goal of the patch is to optimize performance, right? If so,
>> the benchmark score will help justify the change.
>>
>
> The objective is to enable the use of the MPOL_PREFERRED_MANY policy,
> which is essential for the correct functioning of memory demotion in
> conjunction with memory promotion. Once we can use memory promotion, we
> should be able to observe the same benefits as those provided by numa
> fault memory promotion. The actual benefit of numa fault migration is
> dependent on various factors such as the speed of the slower memory
> device, the access pattern of the application, etc. We are discussing
> its effectiveness and how to improve numa fault overhead in other
> forums. However, we believe that this discussion should not hinder the
> merging of this patch.
>
> This change is similar to commit bda420b98505 ("numa balancing: migrate
> on fault among multiple bound nodes")
We provide the performance data in the description of that commit :-)
--
Best Regards,
Huang, Ying
On Tue 20-02-24 09:27:25, Aneesh Kumar K.V wrote:
[...]
> case MPOL_PREFERRED_MANY:
> if (pol->flags & MPOL_F_MORON) {
> if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
> goto out;
> break;
> }
>
> /*
> * use current page if in policy nodemask,
> * else select nearest allowed node, if any.
> * If no allowed nodes, use current [!misplaced].
> */
> if (node_isset(curnid, pol->nodes))
> goto out;
> z = first_zones_zonelist(
> node_zonelist(thisnid, GFP_HIGHUSER),
> gfp_zone(GFP_HIGHUSER),
> &pol->nodes);
> polnid = zone_to_nid(z->zone);
> break;
> ....
> ..
> }
>
> /* Migrate the folio towards the node whose CPU is referencing it */
> if (pol->flags & MPOL_F_MORON) {
> polnid = thisnid;
>
> if (!should_numa_migrate_memory(current, folio, curnid,
> thiscpu))
> goto out;
> }
>
> if (curnid != polnid)
> ret = polnid;
> out:
> mpol_cond_put(pol);
>
> return ret;
> }
Ohh, right, this code is confusing as hell. Thanks for the clarification.
With this in mind, there should be a comment warning that MPOL_F_MOF is
always unset here, as userspace cannot really set it up.
Thanks!
--
Michal Hocko
SUSE Labs
On Tue 20-02-24 15:22:07, Huang, Ying wrote:
[...]
> This isn't an issue now, because mpol_misplaced() are always called with
> PTL held. And, we can still keep thiscpu local variable.
yes, this is the case but it would be better if we made that assumption
official by lockdep_assert_held
--
Michal Hocko
SUSE Labs
On 2/20/24 14:18, Michal Hocko wrote:
> On Tue 20-02-24 09:27:25, Aneesh Kumar K.V wrote:
> [...]
>> case MPOL_PREFERRED_MANY:
>> if (pol->flags & MPOL_F_MORON) {
>> if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
>> goto out;
>> break;
>> }
>>
>> /*
>> * use current page if in policy nodemask,
>> * else select nearest allowed node, if any.
>> * If no allowed nodes, use current [!misplaced].
>> */
>> if (node_isset(curnid, pol->nodes))
>> goto out;
>> z = first_zones_zonelist(
>> node_zonelist(thisnid, GFP_HIGHUSER),
>> gfp_zone(GFP_HIGHUSER),
>> &pol->nodes);
>> polnid = zone_to_nid(z->zone);
>> break;
>> ....
>> ..
>> }
>>
>> /* Migrate the folio towards the node whose CPU is referencing it */
>> if (pol->flags & MPOL_F_MORON) {
>> polnid = thisnid;
>>
>> if (!should_numa_migrate_memory(current, folio, curnid,
>> thiscpu))
>> goto out;
>> }
>>
>> if (curnid != polnid)
>> ret = polnid;
>> out:
>> mpol_cond_put(pol);
>>
>> return ret;
>> }
> Ohh, right this code is confusing as hell. Thanks for the clarification.
> With this in mind. There should be a comment warning about MPOL_F_MOF
> always being unset as the userspace cannot really set it up.
>
> Thanks!
>
Hi Michal
Sorry for the late reply.
If we set MPOL_F_NUMA_BALANCING from userspace, then the MPOL_F_MOF and MPOL_F_MORON flags get set in the kernel:
/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;
        if ((unsigned int)(*mode) >= MPOL_MAX)
                return -EINVAL;
        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;
        if (*flags & MPOL_F_NUMA_BALANCING) {
                if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
                        *flags |= (MPOL_F_MOF | MPOL_F_MORON);
                else
                        return -EINVAL;
        }
In the current kernel this is supported only for MPOL_BIND, and we added support for MPOL_PREFERRED_MANY as well; from userspace that can be requested as in the sketch below.
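(A minimal, hypothetical sketch only -- the node numbers, the buffer size and the
fallback #defines are assumptions, not the exact test program used for the results
above. Build with -lnuma.)

#define _GNU_SOURCE
#include <numaif.h>     /* set_mempolicy() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fallback values from uapi <linux/mempolicy.h> for older libnuma headers */
#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY     5
#endif
#ifndef MPOL_F_NUMA_BALANCING
#define MPOL_F_NUMA_BALANCING   (1 << 13)
#endif

int main(void)
{
        /* prefer the fast (DRAM) nodes 0 and 1 -- placeholder node numbers */
        unsigned long nodemask = (1UL << 0) | (1UL << 1);
        size_t sz = 128UL << 20;        /* 128 MB */
        char *buf;

        /* MPOL_F_NUMA_BALANCING on top of MPOL_PREFERRED_MANY is what this patch enables */
        if (set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
                          &nodemask, 8 * sizeof(nodemask) + 1)) {
                perror("set_mempolicy");        /* fails with EINVAL on kernels without this patch */
                return 1;
        }

        /* keep touching the memory so NUMA hinting faults can migrate/promote it */
        buf = malloc(sz);
        for (int i = 0; i < 100; i++)
                memset(buf, i, sz);

        free(buf);
        return 0;
}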
Why is the MPOL_F_MOF flag required?
------------------------------------
For NUMA migration, "task_numa_work" periodically unmaps (marks protnone) the process memory. If that
memory is accessed again, a NUMA hinting page fault occurs, and the page fault handler migrates the pages.
If MPOL_F_MOF is not set, "task_numa_work" will not unmap the process pages, so no NUMA hinting page faults
and no migrations happen. This behaviour was introduced by commit
fc3147245d193b ("mm: numa: Limit NUMA scanning to migrate-on-fault VMAs").
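As a rough illustration only (a simplified sketch, not the verbatim kernel code --
the real checks live in task_numa_work() in kernel/sched/fair.c and in the existing
helper vma_policy_mof() in mm/mempolicy.c), the scanner effectively gates on
MPOL_F_MOF like this:

/*
 * Simplified sketch: the NUMA scanner only queues a VMA for hinting
 * (protnone) scanning when, among other checks, the VMA's effective
 * mempolicy has MPOL_F_MOF set; vma_policy_mof() reports exactly that.
 */
static bool numa_scan_wanted(struct vm_area_struct *vma)
{
        if (!vma_migratable(vma))       /* e.g. special mappings are skipped */
                return false;
        if (!vma_policy_mof(vma))       /* no MPOL_F_MOF -> no hinting faults */
                return false;
        return true;
}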
How the new implementation works
--------------------------------
MPOL_PREFERRED_MANY can now set MPOL_F_MOF and MPOL_F_MORON through MPOL_F_NUMA_BALANCING, so NUMA hinting
page faults will occur. In mpol_misplaced(), if we can do numa migration, we select the currently executing node
as the target node; otherwise we end up returning from the function with ret = NUMA_NO_NODE.
So, since we are able to set MPOL_F_MOF from userspace through MPOL_F_NUMA_BALANCING, there is no need to add this comment, right?
Thanks
Donet Tom
"Huang, Ying" <[email protected]> writes:
> Aneesh Kumar K.V <[email protected]> writes:
>
>> "Huang, Ying" <[email protected]> writes:
>>
>>> Donet Tom <[email protected]> writes:
>>>
>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>> memory policy. This allowed numa fault migration when the executing node
>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>> support to MPOL_PREFERRED_MANY policy.
>>>>
>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>> the kernel should not allocate pages from the slower memory tier via
>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>> from the faster memory node via memory demotion. For a page allocation,
>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>> the allocation zone list. This implies that, without using memory
>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>
>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>> allocation control when we have memory tiers in the system. With
>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>> to slower memory nodes.
>>>>
>>>> With the current kernel, such usage of memory policies implies we can't
>>>> do page promotion from a slower memory tier to a faster memory tier
>>>> using numa fault. This patch fixes this issue.
>>>>
>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>> node is not in the policy node mask but the folio is already allocated
>>>> based on policy preference (the folio node is in the policy node mask),
>>>> we don't allow numa migration. If both the executing node and folio node
>>>> are outside the policy node mask, we allow numa migration to the
>>>> executing nodes.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>>>> Signed-off-by: Donet Tom <[email protected]>
>>>> ---
>>>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>>>> 1 file changed, 26 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>>>> index 73d698e21dae..8c4c92b10371 100644
>>>> --- a/mm/mempolicy.c
>>>> +++ b/mm/mempolicy.c
>>>> @@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
>>>> if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
>>>> return -EINVAL;
>>>> if (*flags & MPOL_F_NUMA_BALANCING) {
>>>> - if (*mode != MPOL_BIND)
>>>> + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
>>>> + *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>>> + else
>>>> return -EINVAL;
>>>> - *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>>> }
>>>> return 0;
>>>> }
>>>> @@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
>>>> kmem_cache_free(sn_cache, n);
>>>> }
>>>>
>>>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>>>> + struct mempolicy *pol)
>>>> +{
>>>> + /* if the executing node is in the policy node mask, migrate */
>>>> + if (node_isset(exec_node, pol->nodes))
>>>> + return true;
>>>> +
>>>> + /* If the folio node is in policy node mask, don't migrate */
>>>> + if (node_isset(folio_node, pol->nodes))
>>>> + return false;
>>>> + /*
>>>> + * both the folio node and executing node are outside the policy nodemask,
>>>> + * migrate as normal numa fault migration.
>>>> + */
>>>> + return true;
>>>
>>> Why? This may cause some unexpected result. For example, pages may be
>>> distributed among multiple sockets unexpectedly. So, I prefer the more
>>> conservative policy, that is, only migrate if this node is in
>>> pol->nodes.
>>>
>>
>> This will only have an impact if the user specifies
>> MPOL_F_NUMA_BALANCING. This means that the user is explicitly requesting
>> for frequently accessed memory pages to be migrated. Memory policy
>> MPOL_PREFERRED_MANY is able to allocate pages from nodes outside of
>> policy->nodes. For the specific use case that I am interested in, it
>> should be okay to restrict it to policy->nodes. However, I am wondering
>> if this is too restrictive given the definition of MPOL_PREFERRED_MANY.
>
> IMHO, we can start with the more conservative way and expand it if that
> proves necessary.
>
Is this good?
1 file changed, 14 insertions(+), 34 deletions(-)
mm/mempolicy.c | 48 ++++++++++++++----------------------------------
modified mm/mempolicy.c
@@ -2464,23 +2464,6 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
-static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
- struct mempolicy *pol)
-{
- /* if the executing node is in the policy node mask, migrate */
- if (node_isset(exec_node, pol->nodes))
- return true;
-
- /* If the folio node is in policy node mask, don't migrate */
- if (node_isset(folio_node, pol->nodes))
- return false;
- /*
- * both the folio node and executing node are outside the policy nodemask,
- * migrate as normal numa fault migration.
- */
- return true;
-}
-
/**
* mpol_misplaced - check whether current folio node is valid in policy
*
@@ -2533,29 +2516,26 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
break;
case MPOL_BIND:
- /* Optimize placement among multiple nodes via NUMA balancing */
+ case MPOL_PREFERRED_MANY:
+ /*
+ * Even though MPOL_PREFERRED_MANY can allocate pages outside
+ * policy nodemask we don't allow numa migration to nodes
+ * outside policy nodemask for now. This is done so that if we
+ * want demotion to slow memory to happen, before allocating
+ * from some DRAM node say 'x', we will end up using a
+ * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
+ * we should not promote to node 'x' from slow memory node.
+ */
if (pol->flags & MPOL_F_MORON) {
+ /*
+ * Optimize placement among multiple nodes
+ * via NUMA balancing
+ */
if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
- if (node_isset(curnid, pol->nodes))
- goto out;
- z = first_zones_zonelist(
- node_zonelist(thisnid, GFP_HIGHUSER),
- gfp_zone(GFP_HIGHUSER),
- &pol->nodes);
- polnid = zone_to_nid(z->zone);
- break;
-
- case MPOL_PREFERRED_MANY:
- if (pol->flags & MPOL_F_MORON) {
- if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
- goto out;
- break;
- }
-
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
[back]
.
Michal Hocko <[email protected]> writes:
> On Tue 20-02-24 15:22:07, Huang, Ying wrote:
> [...]
>> This isn't an issue now, because mpol_misplaced() are always called with
>> PTL held. And, we can still keep thiscpu local variable.
>
> yes, this is the case but it would be better if we made that assumption
> official by lockdep_assert_held
>
How about this folded into this patch?
2 files changed, 12 insertions(+), 4 deletions(-)
mm/memory.c | 6 ++++--
mm/mempolicy.c | 10 ++++++++--
modified mm/memory.c
@@ -4879,9 +4879,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int page_nid, int *flags)
{
+ struct vm_area_struct *vma = vmf->vma;
+
folio_get(folio);
/* Record the current PID acceesing VMA */
@@ -4893,7 +4895,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(folio, vma, addr);
+ return mpol_misplaced(folio, vmf, addr);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
modified mm/mempolicy.c
@@ -2495,18 +2495,24 @@ static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_n
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
* policy, or a suitable node ID to allocate a replacement folio from.
*/
-int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
unsigned long addr)
{
struct mempolicy *pol;
pgoff_t ilx;
struct zoneref *z;
int curnid = folio_nid(folio);
+ struct vm_area_struct *vma = vmf->vma;
int thiscpu = raw_smp_processor_id();
- int thisnid = cpu_to_node(thiscpu);
+ int thisnid = numa_node_id();
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
+ /*
+ * Make sure ptl is held so that we don't preempt and we
+ * have a stable smp processor id
+ */
+ lockdep_assert_held(vmf->ptl);
pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
if (!(pol->flags & MPOL_F_MOF))
goto out;
[back]
Aneesh Kumar K.V <[email protected]> writes:
> Michal Hocko <[email protected]> writes:
>
>> On Tue 20-02-24 15:22:07, Huang, Ying wrote:
>> [...]
>>> This isn't an issue now, because mpol_misplaced() are always called with
>>> PTL held. And, we can still keep thiscpu local variable.
>>
>> yes, this is the case but it would be better if we made that assumption
>> official by lockdep_assert_held
>>
>
> How about this folded into this patch?
>
> 2 files changed, 12 insertions(+), 4 deletions(-)
> mm/memory.c | 6 ++++--
> mm/mempolicy.c | 10 ++++++++--
>
> modified mm/memory.c
> @@ -4879,9 +4879,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
> return ret;
> }
>
> -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
> +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
> unsigned long addr, int page_nid, int *flags)
> {
> + struct vm_area_struct *vma = vmf->vma;
> +
> folio_get(folio);
>
> /* Record the current PID acceesing VMA */
> @@ -4893,7 +4895,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
> *flags |= TNF_FAULT_LOCAL;
> }
>
> - return mpol_misplaced(folio, vma, addr);
> + return mpol_misplaced(folio, vmf, addr);
> }
>
> static vm_fault_t do_numa_page(struct vm_fault *vmf)
> modified mm/mempolicy.c
> @@ -2495,18 +2495,24 @@ static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_n
> * Return: NUMA_NO_NODE if the page is in a node that is valid for this
> * policy, or a suitable node ID to allocate a replacement folio from.
> */
> -int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
> +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
> unsigned long addr)
> {
> struct mempolicy *pol;
> pgoff_t ilx;
> struct zoneref *z;
> int curnid = folio_nid(folio);
> + struct vm_area_struct *vma = vmf->vma;
> int thiscpu = raw_smp_processor_id();
> - int thisnid = cpu_to_node(thiscpu);
> + int thisnid = numa_node_id();
> int polnid = NUMA_NO_NODE;
> int ret = NUMA_NO_NODE;
>
> + /*
> + * Make sure ptl is held so that we don't preempt and we
> + * have a stable smp processor id
> + */
> + lockdep_assert_held(vmf->ptl);
> pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
> if (!(pol->flags & MPOL_F_MOF))
> goto out;
>
> [back]
>
LGTM, Thanks!
--
Best Regards,
Huang, Ying
Aneesh Kumar K.V <[email protected]> writes:
> "Huang, Ying" <[email protected]> writes:
>
>> Aneesh Kumar K.V <[email protected]> writes:
>>
>>> "Huang, Ying" <[email protected]> writes:
>>>
>>>> Donet Tom <[email protected]> writes:
>>>>
>>>>> commit bda420b98505 ("numa balancing: migrate on fault among multiple bound
>>>>> nodes") added support for migrate on protnone reference with MPOL_BIND
>>>>> memory policy. This allowed numa fault migration when the executing node
>>>>> is part of the policy mask for MPOL_BIND. This patch extends migration
>>>>> support to MPOL_PREFERRED_MANY policy.
>>>>>
>>>>> Currently, we cannot specify MPOL_PREFERRED_MANY with the mempolicy flag
>>>>> MPOL_F_NUMA_BALANCING. This causes issues when we want to use
>>>>> NUMA_BALANCING_MEMORY_TIERING. To effectively use the slow memory tier,
>>>>> the kernel should not allocate pages from the slower memory tier via
>>>>> allocation control zonelist fallback. Instead, we should move cold pages
>>>>> from the faster memory node via memory demotion. For a page allocation,
>>>>> kswapd is only woken up after we try to allocate pages from all nodes in
>>>>> the allocation zone list. This implies that, without using memory
>>>>> policies, we will end up allocating hot pages in the slower memory tier.
>>>>>
>>>>> MPOL_PREFERRED_MANY was added by commit b27abaccf8e8 ("mm/mempolicy: add
>>>>> MPOL_PREFERRED_MANY for multiple preferred nodes") to allow better
>>>>> allocation control when we have memory tiers in the system. With
>>>>> MPOL_PREFERRED_MANY, the user can use a policy node mask consisting only
>>>>> of faster memory nodes. When we fail to allocate pages from the faster
>>>>> memory node, kswapd would be woken up, allowing demotion of cold pages
>>>>> to slower memory nodes.
>>>>>
>>>>> With the current kernel, such usage of memory policies implies we can't
>>>>> do page promotion from a slower memory tier to a faster memory tier
>>>>> using numa fault. This patch fixes this issue.
>>>>>
>>>>> For MPOL_PREFERRED_MANY, if the executing node is in the policy node
>>>>> mask, we allow numa migration to the executing nodes. If the executing
>>>>> node is not in the policy node mask but the folio is already allocated
>>>>> based on policy preference (the folio node is in the policy node mask),
>>>>> we don't allow numa migration. If both the executing node and folio node
>>>>> are outside the policy node mask, we allow numa migration to the
>>>>> executing nodes.
>>>>>
>>>>> Signed-off-by: Aneesh Kumar K.V (IBM) <[email protected]>
>>>>> Signed-off-by: Donet Tom <[email protected]>
>>>>> ---
>>>>> mm/mempolicy.c | 28 ++++++++++++++++++++++++++--
>>>>> 1 file changed, 26 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>>>>> index 73d698e21dae..8c4c92b10371 100644
>>>>> --- a/mm/mempolicy.c
>>>>> +++ b/mm/mempolicy.c
>>>>> @@ -1458,9 +1458,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
>>>>> if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
>>>>> return -EINVAL;
>>>>> if (*flags & MPOL_F_NUMA_BALANCING) {
>>>>> - if (*mode != MPOL_BIND)
>>>>> + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
>>>>> + *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>>>> + else
>>>>> return -EINVAL;
>>>>> - *flags |= (MPOL_F_MOF | MPOL_F_MORON);
>>>>> }
>>>>> return 0;
>>>>> }
>>>>> @@ -2463,6 +2464,23 @@ static void sp_free(struct sp_node *n)
>>>>> kmem_cache_free(sn_cache, n);
>>>>> }
>>>>>
>>>>> +static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
>>>>> + struct mempolicy *pol)
>>>>> +{
>>>>> + /* if the executing node is in the policy node mask, migrate */
>>>>> + if (node_isset(exec_node, pol->nodes))
>>>>> + return true;
>>>>> +
>>>>> + /* If the folio node is in policy node mask, don't migrate */
>>>>> + if (node_isset(folio_node, pol->nodes))
>>>>> + return false;
>>>>> + /*
>>>>> + * both the folio node and executing node are outside the policy nodemask,
>>>>> + * migrate as normal numa fault migration.
>>>>> + */
>>>>> + return true;
>>>>
>>>> Why? This may cause some unexpected result. For example, pages may be
>>>> distributed among multiple sockets unexpectedly. So, I prefer the more
>>>> conservative policy, that is, only migrate if this node is in
>>>> pol->nodes.
>>>>
>>>
>>> This will only have an impact if the user specifies
>>> MPOL_F_NUMA_BALANCING. This means that the user is explicitly requesting
>>> for frequently accessed memory pages to be migrated. Memory policy
>>> MPOL_PREFERRED_MANY is able to allocate pages from nodes outside of
>>> policy->nodes. For the specific use case that I am interested in, it
>>> should be okay to restrict it to policy->nodes. However, I am wondering
>>> if this is too restrictive given the definition of MPOL_PREFERRED_MANY.
>>
>> IMHO, we can start with the more conservative way and expand it if that
>> proves necessary.
>>
>
> Is this good?
>
> 1 file changed, 14 insertions(+), 34 deletions(-)
> mm/mempolicy.c | 48 ++++++++++++++----------------------------------
>
> modified mm/mempolicy.c
> @@ -2464,23 +2464,6 @@ static void sp_free(struct sp_node *n)
> kmem_cache_free(sn_cache, n);
> }
>
> -static inline bool mpol_preferred_should_numa_migrate(int exec_node, int folio_node,
> - struct mempolicy *pol)
> -{
> - /* if the executing node is in the policy node mask, migrate */
> - if (node_isset(exec_node, pol->nodes))
> - return true;
> -
> - /* If the folio node is in policy node mask, don't migrate */
> - if (node_isset(folio_node, pol->nodes))
> - return false;
> - /*
> - * both the folio node and executing node are outside the policy nodemask,
> - * migrate as normal numa fault migration.
> - */
> - return true;
> -}
> -
> /**
> * mpol_misplaced - check whether current folio node is valid in policy
> *
> @@ -2533,29 +2516,26 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
> break;
>
> case MPOL_BIND:
> - /* Optimize placement among multiple nodes via NUMA balancing */
> + case MPOL_PREFERRED_MANY:
> + /*
> + * Even though MPOL_PREFERRED_MANY can allocate pages outside
> + * policy nodemask we don't allow numa migration to nodes
> + * outside policy nodemask for now. This is done so that if we
> + * want demotion to slow memory to happen, before allocating
> + * from some DRAM node say 'x', we will end up using a
> + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
> + * we should not promote to node 'x' from slow memory node.
> + */
> if (pol->flags & MPOL_F_MORON) {
> + /*
> + * Optimize placement among multiple nodes
> + * via NUMA balancing
> + */
> if (node_isset(thisnid, pol->nodes))
> break;
> goto out;
> }
>
> - if (node_isset(curnid, pol->nodes))
> - goto out;
> - z = first_zones_zonelist(
> - node_zonelist(thisnid, GFP_HIGHUSER),
> - gfp_zone(GFP_HIGHUSER),
> - &pol->nodes);
> - polnid = zone_to_nid(z->zone);
> - break;
IMO, the above deletion should be put in another patch?
--
Best Regards,
Huang, Ying
> -
> - case MPOL_PREFERRED_MANY:
> - if (pol->flags & MPOL_F_MORON) {
> - if (!mpol_preferred_should_numa_migrate(thisnid, curnid, pol))
> - goto out;
> - break;
> - }
> -
> /*
> * use current page if in policy nodemask,
> * else select nearest allowed node, if any.
>
> [back]
> .