On pseries systems, performing changes to a partition's affinity
can result in altering the nodes a CPU is assigned to the
current system. For example, some systems are subject to resource
balancing operations by the operator or control software. In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.
The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly. However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.
This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.
[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology update post-migration
From: Nathan Fontenot <[email protected]>
Date: Tue Oct 30 05:43:36 AEDT 2018
]
[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping changes
From: Srikar Dronamraju <[email protected]>
Date: Wed Oct 10 15:24:46 AEDT 2018
]
Signed-off-by: Michael Bringmann <[email protected]>
---
Changes in v03:
-- Fixed under-scheduling of topo updates.
Changes in v02:
-- Reuse more of the previous implementation to reduce patch size
-- Replace former calls to numa_update_cpu_topology(false) by
topology_schedule_update
-- Make sure that we report topology changes back through
arch_update_cpu_topology
-- Fix problem observed in powerpc next kernel with updating
cpu_associativity_changes_mask in timer_topology_fn when both
prrn_enabled and vphn_enabled, and many extra CPUs are possible,
but not installed.
-- Fix problem with updating cpu_associativity_changes_mask when
VPHN associativity information does not arrive until after first
call to update topology occurs.
---
arch/powerpc/include/asm/topology.h | 7 +---
arch/powerpc/kernel/rtasd.c | 2 +
arch/powerpc/mm/numa.c | 69 +++++++++++++++++++++++------------
3 files changed, 48 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index f85e2b0..79505c3 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
extern int sysfs_add_device_to_node(struct device *dev, int nid);
extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
+extern void topology_schedule_update(void);
static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
@@ -77,10 +77,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev,
{
}
-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
- return 0;
-}
+static inline void topology_schedule_update(void) {}
static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 8a1746d..b1828de 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,7 @@ static void handle_prrn_event(s32 scope)
* the RTAS event.
*/
pseries_devicetree_update(-scope);
- numa_update_cpu_topology(false);
+ topology_schedule_update();
}
static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ef6bdf1..a750ec0 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1077,6 +1077,9 @@ struct topology_update_data {
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;
+static int topology_update_in_progress;
+static int topology_changed;
+static unsigned long topology_scans;
/*
* Change polling interval for associativity changes.
@@ -1297,9 +1300,9 @@ static int update_lookup_table(void *data)
* Update the node maps and sysfs entries for each cpu whose home node
* has changed. Returns 1 when the topology has changed, and 0 otherwise.
*
- * cpus_locked says whether we already hold cpu_hotplug_lock.
+ * readd_cpus: Also readd any CPUs that have changed affinity
*/
-int numa_update_cpu_topology(bool cpus_locked)
+static int numa_update_cpu_topology(bool readd_cpus)
{
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
@@ -1307,7 +1310,8 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev;
int weight, new_nid, i = 0;
- if (!prrn_enabled && !vphn_enabled && topology_inited)
+ if ((!prrn_enabled && !vphn_enabled && topology_inited) ||
+ topology_update_in_progress)
return 0;
weight = cpumask_weight(&cpu_associativity_changes_mask);
@@ -1318,6 +1322,8 @@ int numa_update_cpu_topology(bool cpus_locked)
if (!updates)
return 0;
+ topology_update_in_progress = 1;
+
cpumask_clear(&updated_cpus);
for_each_cpu(cpu, &cpu_associativity_changes_mask) {
@@ -1339,16 +1345,21 @@ int numa_update_cpu_topology(bool cpus_locked)
new_nid = find_and_online_cpu_nid(cpu);
- if (new_nid == numa_cpu_lookup_table[cpu]) {
+ if ((new_nid == numa_cpu_lookup_table[cpu]) ||
+ !cpu_present(cpu)) {
cpumask_andnot(&cpu_associativity_changes_mask,
&cpu_associativity_changes_mask,
cpu_sibling_mask(cpu));
- dbg("Assoc chg gives same node %d for cpu%d\n",
+ if (cpu_present(cpu))
+ dbg("Assoc chg gives same node %d for cpu%d\n",
new_nid, cpu);
cpu = cpu_last_thread_sibling(cpu);
continue;
}
+ if (readd_cpus)
+ dlpar_cpu_readd(cpu);
+
for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
ud = &updates[i++];
ud->next = &updates[i];
@@ -1390,23 +1401,15 @@ int numa_update_cpu_topology(bool cpus_locked)
if (!cpumask_weight(&updated_cpus))
goto out;
- if (cpus_locked)
- stop_machine_cpuslocked(update_cpu_topology, &updates[0],
- &updated_cpus);
- else
- stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+ stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
/*
* Update the numa-cpu lookup table with the new mappings, even for
* offline CPUs. It is best to perform this update from the stop-
* machine context.
*/
- if (cpus_locked)
- stop_machine_cpuslocked(update_lookup_table, &updates[0],
- cpumask_of(raw_smp_processor_id()));
- else
- stop_machine(update_lookup_table, &updates[0],
- cpumask_of(raw_smp_processor_id()));
+ stop_machine(update_lookup_table, &updates[0],
+ cpumask_of(raw_smp_processor_id()));
for (ud = &updates[0]; ud; ud = ud->next) {
unregister_cpu_under_node(ud->cpu, ud->old_nid);
@@ -1420,35 +1423,53 @@ int numa_update_cpu_topology(bool cpus_locked)
}
out:
+ topology_changed = changed;
+ topology_update_in_progress = 0;
kfree(updates);
return changed;
}
int arch_update_cpu_topology(void)
{
- return numa_update_cpu_topology(true);
+ int changed = topology_changed;
+
+ topology_changed = 0;
+ return changed;
}
static void topology_work_fn(struct work_struct *work)
{
- rebuild_sched_domains();
+ lock_device_hotplug();
+ if (numa_update_cpu_topology(true))
+ rebuild_sched_domains();
+ unlock_device_hotplug();
}
static DECLARE_WORK(topology_work, topology_work_fn);
-static void topology_schedule_update(void)
+void topology_schedule_update(void)
{
- schedule_work(&topology_work);
+ if (!topology_update_in_progress)
+ schedule_work(&topology_work);
}
static void topology_timer_fn(struct timer_list *unused)
{
+ bool sdo = false;
+
+ if (topology_scans < 1)
+ bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+ nr_cpumask_bits);
+
if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
- topology_schedule_update();
- else if (vphn_enabled) {
+ sdo = true;
+ if (vphn_enabled) {
if (update_cpu_associativity_changes_mask() > 0)
- topology_schedule_update();
+ sdo = true;
reset_topology_timer();
}
+ if (sdo)
+ topology_schedule_update();
+ topology_scans++;
}
static struct timer_list topology_timer;
@@ -1561,7 +1582,7 @@ void __init shared_proc_topology_init(void)
if (lppaca_shared_proc(get_lppaca())) {
bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
nr_cpumask_bits);
- numa_update_cpu_topology(false);
+ topology_schedule_update();
}
}
Hi Michael,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.0-rc4 next-20190206]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Michael-Bringmann/powerpc-numa-Perform-full-re-add-of-CPU-for-PRRN-VPHN-topology-update/20190207-101545
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 8.2.0-11) 8.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.2.0 make.cross ARCH=powerpc
All errors (new ones prefixed by >>):
arch/powerpc/mm/numa.c: In function 'numa_update_cpu_topology':
>> arch/powerpc/mm/numa.c:1361:4: error: implicit declaration of function 'dlpar_cpu_readd'; did you mean 'raw_cpu_read'? [-Werror=implicit-function-declaration]
dlpar_cpu_readd(cpu);
^~~~~~~~~~~~~~~
raw_cpu_read
cc1: some warnings being treated as errors
vim +1361 arch/powerpc/mm/numa.c
1298
1299 /*
1300 * Update the node maps and sysfs entries for each cpu whose home node
1301 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
1302 *
1303 * readd_cpus: Also readd any CPUs that have changed affinity
1304 */
1305 static int numa_update_cpu_topology(bool readd_cpus)
1306 {
1307 unsigned int cpu, sibling, changed = 0;
1308 struct topology_update_data *updates, *ud;
1309 cpumask_t updated_cpus;
1310 struct device *dev;
1311 int weight, new_nid, i = 0;
1312
1313 if ((!prrn_enabled && !vphn_enabled && topology_inited) ||
1314 topology_update_in_progress)
1315 return 0;
1316
1317 weight = cpumask_weight(&cpu_associativity_changes_mask);
1318 if (!weight)
1319 return 0;
1320
1321 updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
1322 if (!updates)
1323 return 0;
1324
1325 topology_update_in_progress = 1;
1326
1327 cpumask_clear(&updated_cpus);
1328
1329 for_each_cpu(cpu, &cpu_associativity_changes_mask) {
1330 /*
1331 * If siblings aren't flagged for changes, updates list
1332 * will be too short. Skip on this update and set for next
1333 * update.
1334 */
1335 if (!cpumask_subset(cpu_sibling_mask(cpu),
1336 &cpu_associativity_changes_mask)) {
1337 pr_info("Sibling bits not set for associativity "
1338 "change, cpu%d\n", cpu);
1339 cpumask_or(&cpu_associativity_changes_mask,
1340 &cpu_associativity_changes_mask,
1341 cpu_sibling_mask(cpu));
1342 cpu = cpu_last_thread_sibling(cpu);
1343 continue;
1344 }
1345
1346 new_nid = find_and_online_cpu_nid(cpu);
1347
1348 if ((new_nid == numa_cpu_lookup_table[cpu]) ||
1349 !cpu_present(cpu)) {
1350 cpumask_andnot(&cpu_associativity_changes_mask,
1351 &cpu_associativity_changes_mask,
1352 cpu_sibling_mask(cpu));
1353 if (cpu_present(cpu))
1354 dbg("Assoc chg gives same node %d for cpu%d\n",
1355 new_nid, cpu);
1356 cpu = cpu_last_thread_sibling(cpu);
1357 continue;
1358 }
1359
1360 if (readd_cpus)
> 1361 dlpar_cpu_readd(cpu);
1362
1363 for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
1364 ud = &updates[i++];
1365 ud->next = &updates[i];
1366 ud->cpu = sibling;
1367 ud->new_nid = new_nid;
1368 ud->old_nid = numa_cpu_lookup_table[sibling];
1369 cpumask_set_cpu(sibling, &updated_cpus);
1370 }
1371 cpu = cpu_last_thread_sibling(cpu);
1372 }
1373
1374 /*
1375 * Prevent processing of 'updates' from overflowing array
1376 * where last entry filled in a 'next' pointer.
1377 */
1378 if (i)
1379 updates[i-1].next = NULL;
1380
1381 pr_debug("Topology update for the following CPUs:\n");
1382 if (cpumask_weight(&updated_cpus)) {
1383 for (ud = &updates[0]; ud; ud = ud->next) {
1384 pr_debug("cpu %d moving from node %d "
1385 "to %d\n", ud->cpu,
1386 ud->old_nid, ud->new_nid);
1387 }
1388 }
1389
1390 /*
1391 * In cases where we have nothing to update (because the updates list
1392 * is too short or because the new topology is same as the old one),
1393 * skip invoking update_cpu_topology() via stop-machine(). This is
1394 * necessary (and not just a fast-path optimization) since stop-machine
1395 * can end up electing a random CPU to run update_cpu_topology(), and
1396 * thus trick us into setting up incorrect cpu-node mappings (since
1397 * 'updates' is kzalloc()'ed).
1398 *
1399 * And for the similar reason, we will skip all the following updating.
1400 */
1401 if (!cpumask_weight(&updated_cpus))
1402 goto out;
1403
1404 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
1405
1406 /*
1407 * Update the numa-cpu lookup table with the new mappings, even for
1408 * offline CPUs. It is best to perform this update from the stop-
1409 * machine context.
1410 */
1411 stop_machine(update_lookup_table, &updates[0],
1412 cpumask_of(raw_smp_processor_id()));
1413
1414 for (ud = &updates[0]; ud; ud = ud->next) {
1415 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1416 register_cpu_under_node(ud->cpu, ud->new_nid);
1417
1418 dev = get_cpu_device(ud->cpu);
1419 if (dev)
1420 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
1421 cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
1422 changed = 1;
1423 }
1424
1425 out:
1426 topology_changed = changed;
1427 topology_update_in_progress = 0;
1428 kfree(updates);
1429 return changed;
1430 }
1431
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
>
> int arch_update_cpu_topology(void)
> {
> - return numa_update_cpu_topology(true);
> + int changed = topology_changed;
> +
> + topology_changed = 0;
> + return changed;
> }
>
Do we need Powerpc override for arch_update_cpu_topology() now? That
topology_changed sometime back doesn't seem to have help. The scheduler
atleast now is neglecting whether the topology changed or not.
Also we can do away with the new topology_changed.
> static void topology_work_fn(struct work_struct *work)
> {
> - rebuild_sched_domains();
> + lock_device_hotplug();
> + if (numa_update_cpu_topology(true))
> + rebuild_sched_domains();
> + unlock_device_hotplug();
> }
Should this hunk be a separate patch by itself to say why
rebuild_sched_domains with a changelog that explains why it should be under
lock_device_hotplug? rebuild_sched_domains already takes cpuset_mutex.
So I am not sure if we need to take device_hotplug_lock.
> static DECLARE_WORK(topology_work, topology_work_fn);
>
> -static void topology_schedule_update(void)
> +void topology_schedule_update(void)
> {
> - schedule_work(&topology_work);
> + if (!topology_update_in_progress)
> + schedule_work(&topology_work);
> }
>
> static void topology_timer_fn(struct timer_list *unused)
> {
> + bool sdo = false;
Is sdo any abbrevation?
> +
> + if (topology_scans < 1)
> + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
> + nr_cpumask_bits);
Why do we need topology_scan? Just to make sure
cpu_associativity_changes_mask is populated only once?
cant we use a static bool inside the function for the same?
> +
> if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
> - topology_schedule_update();
> - else if (vphn_enabled) {
> + sdo = true;
> + if (vphn_enabled) {
Any reason to remove the else above?
> if (update_cpu_associativity_changes_mask() > 0)
> - topology_schedule_update();
> + sdo = true;
> reset_topology_timer();
> }
> + if (sdo)
> + topology_schedule_update();
> + topology_scans++;
> }
Are the above two hunks necessary? Not getting how the current changes are
different from the previous.
--
Thanks and Regards
Srikar Dronamraju