Abhishek reported that after patch [1], hotplug operations are
taking ~double the expected time. [2]
The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
sets always call set_migration_target_nodes() whenever a CPU is brought
up/down.
But we only care about numa nodes going from having cpus to become
cpuless, and vice versa, as that influences the demotion_target order.
We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
that check exactly that, so get rid of the CPU callbacks in
migrate_on_reclaim_init() and only call set_migration_target_nodes() from
vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
[1] https://lore.kernel.org/linux-mm/[email protected]/
[2] https://lore.kernel.org/linux-mm/[email protected]/
Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
Reviewed-by: Baolin Wang <[email protected]>
Tested-by: Baolin Wang <[email protected]>
Reported-by: Abhishek Goel <[email protected]>
Signed-off-by: Oscar Salvador <[email protected]>
---
v2 -> v3:
- Add feedback from Huang Ying
- Add tags
v1 -> v2:
- Add fedback from Huang Ying
- Add feedback from Baolin Wang
---
include/linux/migrate.h | 8 +++++++
mm/migrate.c | 47 +++++++++--------------------------------
mm/vmstat.c | 13 +++++++++++-
3 files changed, 30 insertions(+), 38 deletions(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da..90e75d5a54d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count);
extern bool numa_demotion_enabled;
+extern void migrate_on_reclaim_init(void);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void set_migration_target_nodes(void);
#else
+static inline void set_migration_target_nodes(void) {}
+#endif
+#else
+
+static inline void set_migration_target_nodes(void) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t new,
diff --git a/mm/migrate.c b/mm/migrate.c
index e8a6933af68d..2561881f03b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3193,7 +3193,7 @@ static void __set_migration_target_nodes(void)
/*
* For callers that do not hold get_online_mems() already.
*/
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
@@ -3257,51 +3257,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
+void __init migrate_on_reclaim_init(void)
{
- set_migration_target_nodes();
- return 0;
-}
-
-static int __init migrate_on_reclaim_init(void)
-{
- int ret;
-
node_demotion = kmalloc_array(nr_node_ids,
sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
- NULL, migration_offline_cpu);
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
/*
- * In the unlikely case that this fails, the automatic
- * migration targets may become suboptimal for nodes
- * where N_CPU changes. With such a small impact in a
- * rare case, do not bother trying to do anything special.
+ * At this point, all numa nodes with memory/CPus have their state
+ * properly set, so we can build the demotion order now.
+ * Let us hold the cpu_hotplug lock just, as we could possibily have
+ * CPU hotplug events during boot.
*/
- WARN_ON(ret < 0);
- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
- migration_online_cpu, NULL);
- WARN_ON(ret < 0);
-
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
- return 0;
+ cpus_read_lock();
+ set_migration_target_nodes();
+ cpus_read_unlock();
}
-late_initcall(migrate_on_reclaim_init);
#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4057372745d0..9e9536df51b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
+
+ if (!node_state(cpu_to_node(cpu), N_CPU)) {
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ set_migration_target_nodes();
+ }
+
return 0;
}
@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
+ set_migration_target_nodes();
+
return 0;
}
@@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
+#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
+ migrate_on_reclaim_init();
+#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
--
2.34.1
On Tue, Mar 15, 2022 at 02:40:53PM +0800, Huang, Ying wrote:
> Oscar Salvador <[email protected]> writes:
> CPUHP_MM_DEMOTION_DEAD and CPUHP_AP_MM_DEMOTION_ONLINE needs to be
> deleted from include/linux/cpuhotplug.h too.
Hi Huang Ying,
Right.
Andrew, can you apply this on top? Thanks
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 411a428ace4d..8a9a2d01b7c6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -72,8 +72,6 @@ enum cpuhp_state {
CPUHP_SLUB_DEAD,
CPUHP_DEBUG_OBJ_DEAD,
CPUHP_MM_WRITEBACK_DEAD,
- /* Must be after CPUHP_MM_VMSTAT_DEAD */
- CPUHP_MM_DEMOTION_DEAD,
CPUHP_MM_VMSTAT_DEAD,
CPUHP_SOFTIRQ_DEAD,
CPUHP_NET_MVNETA_DEAD,
@@ -244,8 +242,6 @@ enum cpuhp_state {
CPUHP_AP_BASE_CACHEINFO_ONLINE,
CPUHP_AP_ONLINE_DYN,
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30,
- /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */
- CPUHP_AP_MM_DEMOTION_ONLINE,
CPUHP_AP_X86_HPET_ONLINE,
CPUHP_AP_X86_KVM_CLK_ONLINE,
CPUHP_AP_ACTIVE,
Huang Ying, it would be great to have your Reviewed-by/Acked-by if you
are ok with the outome.
Thanks
--
Oscar Salvador
SUSE Labs
On 14/03/22 20:39, Oscar Salvador wrote:
> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
>
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
>
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/
> [2] https://lore.kernel.org/linux-mm/[email protected]/
>
> Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
> Reviewed-by: Baolin Wang <[email protected]>
> Tested-by: Baolin Wang <[email protected]>
> Reported-by: Abhishek Goel <[email protected]>
> Signed-off-by: Oscar Salvador <[email protected]>
> ---
> v2 -> v3:
> - Add feedback from Huang Ying
> - Add tags
> v1 -> v2:
> - Add fedback from Huang Ying
> - Add feedback from Baolin Wang
> ---
> include/linux/migrate.h | 8 +++++++
> mm/migrate.c | 47 +++++++++--------------------------------
> mm/vmstat.c | 13 +++++++++++-
> 3 files changed, 30 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index db96e10eb8da..90e75d5a54d6 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
> struct folio *newfolio, struct folio *folio, int extra_count);
>
> extern bool numa_demotion_enabled;
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
> #else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else
> +
> +static inline void set_migration_target_nodes(void) {}
>
> static inline void putback_movable_pages(struct list_head *l) {}
> static inline int migrate_pages(struct list_head *l, new_page_t new,
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e8a6933af68d..2561881f03b2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3193,7 +3193,7 @@ static void __set_migration_target_nodes(void)
> /*
> * For callers that do not hold get_online_mems() already.
> */
> -static void set_migration_target_nodes(void)
> +void set_migration_target_nodes(void)
> {
> get_online_mems();
> __set_migration_target_nodes();
> @@ -3257,51 +3257,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> return notifier_from_errno(0);
> }
>
> -/*
> - * React to hotplug events that might affect the migration targets
> - * like events that online or offline NUMA nodes.
> - *
> - * The ordering is also currently dependent on which nodes have
> - * CPUs. That means we need CPU on/offline notification too.
> - */
> -static int migration_online_cpu(unsigned int cpu)
> -{
> - set_migration_target_nodes();
> - return 0;
> -}
> -
> -static int migration_offline_cpu(unsigned int cpu)
> +void __init migrate_on_reclaim_init(void)
> {
> - set_migration_target_nodes();
> - return 0;
> -}
> -
> -static int __init migrate_on_reclaim_init(void)
> -{
> - int ret;
> -
> node_demotion = kmalloc_array(nr_node_ids,
> sizeof(struct demotion_nodes),
> GFP_KERNEL);
> WARN_ON(!node_demotion);
>
> - ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
> - NULL, migration_offline_cpu);
> + hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> /*
> - * In the unlikely case that this fails, the automatic
> - * migration targets may become suboptimal for nodes
> - * where N_CPU changes. With such a small impact in a
> - * rare case, do not bother trying to do anything special.
> + * At this point, all numa nodes with memory/CPus have their state
> + * properly set, so we can build the demotion order now.
> + * Let us hold the cpu_hotplug lock just, as we could possibily have
> + * CPU hotplug events during boot.
> */
> - WARN_ON(ret < 0);
> - ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> - migration_online_cpu, NULL);
> - WARN_ON(ret < 0);
> -
> - hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> - return 0;
> + cpus_read_lock();
> + set_migration_target_nodes();
> + cpus_read_unlock();
> }
> -late_initcall(migrate_on_reclaim_init);
> #endif /* CONFIG_HOTPLUG_CPU */
>
> bool numa_demotion_enabled = false;
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4057372745d0..9e9536df51b5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -28,6 +28,7 @@
> #include <linux/mm_inline.h>
> #include <linux/page_ext.h>
> #include <linux/page_owner.h>
> +#include <linux/migrate.h>
>
> #include "internal.h"
>
> @@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(void)
> static int vmstat_cpu_online(unsigned int cpu)
> {
> refresh_zone_stat_thresholds();
> - node_set_state(cpu_to_node(cpu), N_CPU);
> +
> + if (!node_state(cpu_to_node(cpu), N_CPU)) {
> + node_set_state(cpu_to_node(cpu), N_CPU);
> + set_migration_target_nodes();
> + }
> +
> return 0;
> }
>
> @@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
> return 0;
>
> node_clear_state(node, N_CPU);
> + set_migration_target_nodes();
> +
> return 0;
> }
>
> @@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
>
> start_shepherd_timer();
> #endif
> +#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
> + migrate_on_reclaim_init();
> +#endif
> #ifdef CONFIG_PROC_FS
> proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
> proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
Tested three different kernels on a ppc system for hotplug operation-
Kernel
version Time taken
1. 5.14.0(baseline kernel) 9.6s
2. 5.15.0(upstream kernel without this patch) 23.1s
3. 5.17.0-rc8+(upstream kernel with this patch) 11.3s
*Time taken= avg total time taken to switch from smt=8 to smt=1
I have used v5.15.0 kernel for 2nd experiment as the issue was
introduced between 5.14 and 5.15. Similar numbers are
observed for any subsequent upstream kernels without this fix patch.
Overall LGTM. Please feel free to add:
Tested by: Abhishek Goel <[email protected]>
--Abhishek
Oscar Salvador <[email protected]> writes:
> Abhishek reported that after patch [1], hotplug operations are
> taking ~double the expected time. [2]
>
> The reason behind is that the CPU callbacks that migrate_on_reclaim_init()
> sets always call set_migration_target_nodes() whenever a CPU is brought
> up/down.
> But we only care about numa nodes going from having cpus to become
> cpuless, and vice versa, as that influences the demotion_target order.
>
> We do already have two CPU callbacks (vmstat_cpu_online() and vmstat_cpu_dead())
> that check exactly that, so get rid of the CPU callbacks in
> migrate_on_reclaim_init() and only call set_migration_target_nodes() from
> vmstat_cpu_{dead,online}() whenever a numa node change its N_CPU state.
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/
> [2] https://lore.kernel.org/linux-mm/[email protected]/
>
> Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
> Reviewed-by: Baolin Wang <[email protected]>
> Tested-by: Baolin Wang <[email protected]>
> Reported-by: Abhishek Goel <[email protected]>
> Signed-off-by: Oscar Salvador <[email protected]>
> ---
> v2 -> v3:
> - Add feedback from Huang Ying
> - Add tags
> v1 -> v2:
> - Add fedback from Huang Ying
> - Add feedback from Baolin Wang
> ---
> include/linux/migrate.h | 8 +++++++
> mm/migrate.c | 47 +++++++++--------------------------------
> mm/vmstat.c | 13 +++++++++++-
> 3 files changed, 30 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index db96e10eb8da..90e75d5a54d6 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
> struct folio *newfolio, struct folio *folio, int extra_count);
>
> extern bool numa_demotion_enabled;
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
> #else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else
> +
> +static inline void set_migration_target_nodes(void) {}
>
> static inline void putback_movable_pages(struct list_head *l) {}
> static inline int migrate_pages(struct list_head *l, new_page_t new,
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e8a6933af68d..2561881f03b2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3193,7 +3193,7 @@ static void __set_migration_target_nodes(void)
> /*
> * For callers that do not hold get_online_mems() already.
> */
> -static void set_migration_target_nodes(void)
> +void set_migration_target_nodes(void)
> {
> get_online_mems();
> __set_migration_target_nodes();
> @@ -3257,51 +3257,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> return notifier_from_errno(0);
> }
>
> -/*
> - * React to hotplug events that might affect the migration targets
> - * like events that online or offline NUMA nodes.
> - *
> - * The ordering is also currently dependent on which nodes have
> - * CPUs. That means we need CPU on/offline notification too.
> - */
> -static int migration_online_cpu(unsigned int cpu)
> -{
> - set_migration_target_nodes();
> - return 0;
> -}
> -
> -static int migration_offline_cpu(unsigned int cpu)
> +void __init migrate_on_reclaim_init(void)
> {
> - set_migration_target_nodes();
> - return 0;
> -}
> -
> -static int __init migrate_on_reclaim_init(void)
> -{
> - int ret;
> -
> node_demotion = kmalloc_array(nr_node_ids,
> sizeof(struct demotion_nodes),
> GFP_KERNEL);
> WARN_ON(!node_demotion);
>
> - ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
CPUHP_MM_DEMOTION_DEAD and CPUHP_AP_MM_DEMOTION_ONLINE needs to be
deleted from include/linux/cpuhotplug.h too.
Best Regards,
Huang, Ying
> - NULL, migration_offline_cpu);
> + hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> /*
> - * In the unlikely case that this fails, the automatic
> - * migration targets may become suboptimal for nodes
> - * where N_CPU changes. With such a small impact in a
> - * rare case, do not bother trying to do anything special.
> + * At this point, all numa nodes with memory/CPus have their state
> + * properly set, so we can build the demotion order now.
> + * Let us hold the cpu_hotplug lock just, as we could possibily have
> + * CPU hotplug events during boot.
> */
> - WARN_ON(ret < 0);
> - ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
> - migration_online_cpu, NULL);
> - WARN_ON(ret < 0);
> -
> - hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
> - return 0;
> + cpus_read_lock();
> + set_migration_target_nodes();
> + cpus_read_unlock();
> }
> -late_initcall(migrate_on_reclaim_init);
> #endif /* CONFIG_HOTPLUG_CPU */
>
> bool numa_demotion_enabled = false;
[snip]