2012-06-13 11:01:32

by Thomas Gleixner

Subject: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

Provide a generic interface for setting up and tearing down percpu
threads.

On registration, the threads for already online CPUs are created and
started. On deregistration (module exit), the threads are stopped.

During hotplug operations, the threads are created, started, parked and
unparked. The data structure for registration provides a pointer to
percpu storage space and optional setup, cleanup, park and unpark
functions. These functions are called when the thread state changes.

Thread functions should look like the following:

int thread_fn(void *cookie)
{
	while (!smpboot_thread_check_parking(cookie)) {
		do_stuff();
	}
	return 0;
}
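
For illustration, registering such a thread could look like the
following sketch. The per-cpu task pointer, the descriptor and the
comm name are made-up example names, not part of this patch:

static DEFINE_PER_CPU(struct task_struct *, example_task);

static struct smp_hotplug_thread example_threads = {
	.store		= &example_task,
	.thread_fn	= thread_fn,
	.thread_comm	= "example/%u",
};

/* Creates, starts and unparks one thread per online cpu */
ret = smpboot_register_percpu_thread(&example_threads);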

Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/smpboot.h | 40 +++++++++
kernel/cpu.c | 6 +
kernel/smpboot.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/smpboot.h | 4
4 files changed, 255 insertions(+)

Index: tip/include/linux/smpboot.h
===================================================================
--- /dev/null
+++ tip/include/linux/smpboot.h
@@ -0,0 +1,40 @@
+#ifndef _LINUX_SMPBOOT_H
+#define _LINUX_SMPBOOT_H
+
+#include <linux/types.h>
+
+struct task_struct;
+/* Cookie handed to the thread_fn */
+struct smpboot_thread_data;
+
+/**
+ * struct smp_hotplug_thread - CPU hotplug related thread descriptor
+ * @store:		Pointer to per cpu storage for the task pointers
+ * @list:		List head for core management
+ * @thread_fn:		The associated thread function
+ * @setup:		Optional setup function, called when the thread gets
+ *			operational the first time
+ * @cleanup:		Optional cleanup function, called when the thread
+ *			should stop (module exit)
+ * @park:		Optional park function, called when the thread is
+ *			parked (cpu offline)
+ * @unpark:		Optional unpark function, called when the thread is
+ *			unparked (cpu online)
+ * @thread_comm:	The base name of the thread
+ */
+struct smp_hotplug_thread {
+	struct task_struct __percpu	**store;
+	struct list_head		list;
+	int				(*thread_fn)(void *data);
+	void				(*setup)(unsigned int cpu);
+	void				(*cleanup)(unsigned int cpu, bool online);
+	void				(*park)(unsigned int cpu);
+	void				(*unpark)(unsigned int cpu);
+	const char			*thread_comm;
+};
+
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_thread_check_parking(struct smpboot_thread_data *td);
+
+#endif
Index: tip/kernel/cpu.c
===================================================================
--- tip.orig/kernel/cpu.c
+++ tip/kernel/cpu.c
@@ -280,6 +280,7 @@ static int __ref _cpu_down(unsigned int
 		       __func__, cpu);
 		goto out_release;
 	}
+	smpboot_park_threads(cpu);

 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned in
 		goto out;
 	}

+	ret = smpboot_create_threads(cpu);
+	if (ret)
+		goto out;
+
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
@@ -370,6 +375,7 @@ static int __cpuinit _cpu_up(unsigned in

 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
+	smpboot_unpark_threads(cpu);

 out_notify:
 	if (ret != 0)
Index: tip/kernel/smpboot.c
===================================================================
--- tip.orig/kernel/smpboot.c
+++ tip/kernel/smpboot.c
@@ -1,11 +1,17 @@
/*
* Common SMP CPU bringup/teardown functions
*/
+#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>

#include "smpboot.h"

@@ -65,3 +71,202 @@ void __init idle_threads_init(void)
 	}
 }
 #endif
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+	unsigned int			cpu;
+	unsigned int			status;
+	struct smp_hotplug_thread	*ht;
+};
+
+enum {
+	HP_THREAD_NONE = 0,
+	HP_THREAD_ACTIVE,
+	HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_check_parking - percpu hotplug thread loop function
+ * @td:		thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 1 when the thread should exit, 0 otherwise.
+ */
+int smpboot_thread_check_parking(struct smpboot_thread_data *td)
+{
+	struct smp_hotplug_thread *ht = td->ht;
+
+again:
+	if (kthread_should_stop()) {
+		if (ht->cleanup)
+			ht->cleanup(td->cpu, cpu_online(td->cpu));
+		kfree(td);
+		return 1;
+	}
+
+	BUG_ON(td->cpu != smp_processor_id());
+
+	if (kthread_should_park()) {
+		if (ht->park && td->status == HP_THREAD_ACTIVE) {
+			ht->park(td->cpu);
+			td->status = HP_THREAD_PARKED;
+		}
+		kthread_parkme();
+		/* We might have been woken for stop */
+		goto again;
+	}
+
+	switch (td->status) {
+	case HP_THREAD_NONE:
+		if (ht->setup)
+			ht->setup(td->cpu);
+		td->status = HP_THREAD_ACTIVE;
+		break;
+	case HP_THREAD_PARKED:
+		if (ht->unpark)
+			ht->unpark(td->cpu);
+		td->status = HP_THREAD_ACTIVE;
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_thread_check_parking);
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+	struct smpboot_thread_data *td;
+
+	if (tsk)
+		return 0;
+
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+	if (!td)
+		return -ENOMEM;
+	td->cpu = cpu;
+	td->ht = ht;
+
+	tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm);
+	if (IS_ERR(tsk))
+		return PTR_ERR(tsk);
+
+	get_task_struct(tsk);
+	*per_cpu_ptr(ht->store, cpu) = tsk;
+	return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+	int ret = 0;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry(cur, &hotplug_threads, list) {
+		ret = __smpboot_create_thread(cur, cpu);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&smpboot_threads_lock);
+	return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+	kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry(cur, &hotplug_threads, list)
+		smpboot_unpark_thread(cur, cpu);
+	mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+	if (tsk)
+		kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry(cur, &hotplug_threads, list)
+		smpboot_park_thread(cur, cpu);
+	mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+	unsigned int cpu;
+
+	/* We need to destroy also the parked threads of offline cpus */
+	for_each_possible_cpu(cpu) {
+		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+		if (tsk) {
+			kthread_stop(tsk);
+			put_task_struct(tsk);
+			*per_cpu_ptr(ht->store, cpu) = NULL;
+		}
+	}
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread:	Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	unsigned int cpu;
+	int ret = 0;
+
+	mutex_lock(&smpboot_threads_lock);
+	for_each_online_cpu(cpu) {
+		ret = __smpboot_create_thread(plug_thread, cpu);
+		if (ret) {
+			smpboot_destroy_threads(plug_thread);
+			goto out;
+		}
+		smpboot_unpark_thread(plug_thread, cpu);
+	}
+	list_add(&plug_thread->list, &hotplug_threads);
+out:
+	mutex_unlock(&smpboot_threads_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread:	Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	get_online_cpus();
+	mutex_lock(&smpboot_threads_lock);
+	list_del(&plug_thread->list);
+	smpboot_destroy_threads(plug_thread);
+	mutex_unlock(&smpboot_threads_lock);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
Index: tip/kernel/smpboot.h
===================================================================
--- tip.orig/kernel/smpboot.h
+++ tip/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_
static inline void idle_threads_init(void) { }
#endif

+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
#endif


2012-06-13 18:33:18

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> Provide a generic interface for setting up and tearing down percpu
> threads.
>
> On registration, the threads for already online CPUs are created and
> started. On deregistration (module exit), the threads are stopped.
>
> During hotplug operations, the threads are created, started, parked and
> unparked. The data structure for registration provides a pointer to
> percpu storage space and optional setup, cleanup, park and unpark
> functions. These functions are called when the thread state changes.
>
> Thread functions should look like the following:
>
> int thread_fn(void *cookie)
> {
> 	while (!smpboot_thread_check_parking(cookie)) {
> 		do_stuff();
> 	}
> 	return 0;
> }

Very cool!!!

So I am currently trying to apply this to RCU's per-CPU kthread.
I don't believe that I need to mess with RCU's per-rcu_node kthread
because it can just have its affinity adjusted when the first CPU
onlines and the last CPU offlines for the corresponding rcu_node.

One question below about the order of parking.

Also, I have not yet figured out how this avoids a parked thread waking
up while the CPU is offline, but I am probably still missing something.

Thanx, Paul

> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> include/linux/smpboot.h | 40 +++++++++
> kernel/cpu.c | 6 +
> kernel/smpboot.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/smpboot.h | 4
> 4 files changed, 255 insertions(+)
>
> Index: tip/include/linux/smpboot.h
> ===================================================================
> --- /dev/null
> +++ tip/include/linux/smpboot.h
> @@ -0,0 +1,40 @@
> +#ifndef _LINUX_SMPBOOT_H
> +#define _LINUX_SMPBOOT_H
> +
> +#include <linux/types.h>
> +
> +struct task_struct;
> +/* Cookie handed to the thread_fn*/
> +struct smpboot_thread_data;
> +
> +/**
> + * struct smp_hotplug_thread - CPU hotplug related thread descriptor
> + * @store: Pointer to per cpu storage for the task pointers
> + * @list: List head for core management
> + * @thread_fn: The associated thread function
> + * @setup: Optional setup function, called when the thread gets
> + * operational the first time
> + * @cleanup: Optional cleanup function, called when the thread
> + * should stop (module exit)
> + * @park: Optional park function, called when the thread is
> + * parked (cpu offline)
> + * @unpark: Optional unpark function, called when the thread is
> + * unparked (cpu online)
> + * @thread_comm: The base name of the thread
> + */
> +struct smp_hotplug_thread {
> + struct task_struct __percpu **store;
> + struct list_head list;
> + int (*thread_fn)(void *data);
> + void (*setup)(unsigned int cpu);
> + void (*cleanup)(unsigned int cpu, bool online);
> + void (*park)(unsigned int cpu);
> + void (*unpark)(unsigned int cpu);
> + const char *thread_comm;
> +};
> +
> +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
> +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
> +int smpboot_thread_check_parking(struct smpboot_thread_data *td);
> +
> +#endif
> Index: tip/kernel/cpu.c
> ===================================================================
> --- tip.orig/kernel/cpu.c
> +++ tip/kernel/cpu.c
> @@ -280,6 +280,7 @@ static int __ref _cpu_down(unsigned int
> __func__, cpu);
> goto out_release;
> }
> + smpboot_park_threads(cpu);
>
> err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
> if (err) {
> @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned in
> goto out;
> }
>
> + ret = smpboot_create_threads(cpu);
> + if (ret)
> + goto out;
> +
> ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
> if (ret) {
> nr_calls--;
> @@ -370,6 +375,7 @@ static int __cpuinit _cpu_up(unsigned in
>
> /* Now call notifier in preparation. */
> cpu_notify(CPU_ONLINE | mod, hcpu);
> + smpboot_unpark_threads(cpu);
>
> out_notify:
> if (ret != 0)
> Index: tip/kernel/smpboot.c
> ===================================================================
> --- tip.orig/kernel/smpboot.c
> +++ tip/kernel/smpboot.c
> @@ -1,11 +1,17 @@
> /*
> * Common SMP CPU bringup/teardown functions
> */
> +#include <linux/cpu.h>
> #include <linux/err.h>
> #include <linux/smp.h>
> #include <linux/init.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> #include <linux/sched.h>
> +#include <linux/export.h>
> #include <linux/percpu.h>
> +#include <linux/kthread.h>
> +#include <linux/smpboot.h>
>
> #include "smpboot.h"
>
> @@ -65,3 +71,202 @@ void __init idle_threads_init(void)
> }
> }
> #endif
> +
> +static LIST_HEAD(hotplug_threads);
> +static DEFINE_MUTEX(smpboot_threads_lock);
> +
> +struct smpboot_thread_data {
> + unsigned int cpu;
> + unsigned int status;
> + struct smp_hotplug_thread *ht;
> +};
> +
> +enum {
> + HP_THREAD_NONE = 0,
> + HP_THREAD_ACTIVE,
> + HP_THREAD_PARKED,
> +};
> +
> +/**
> + * smpboot_thread_check_parking - percpu hotplug thread loop function
> + * @td: thread data pointer
> + *
> + * Checks for thread stop and park conditions. Calls the necessary
> + * setup, cleanup, park and unpark functions for the registered
> + * thread.
> + *
> + * Returns 1 when the thread should exit, 0 otherwise.
> + */
> +int smpboot_thread_check_parking(struct smpboot_thread_data *td)
> +{
> + struct smp_hotplug_thread *ht = td->ht;
> +
> +again:
> + if (kthread_should_stop()) {
> + if (ht->cleanup)
> + ht->cleanup(td->cpu, cpu_online(td->cpu));
> + kfree(td);
> + return 1;
> + }
> +
> + BUG_ON(td->cpu != smp_processor_id());
> +
> + if (kthread_should_park()) {
> + if (ht->park && td->status == HP_THREAD_ACTIVE) {
> + ht->park(td->cpu);
> + td->status = HP_THREAD_PARKED;
> + }
> + kthread_parkme();
> + /* We might have been woken for stop */
> + goto again;
> + }
> +
> + switch (td->status) {
> + case HP_THREAD_NONE:
> + if (ht->setup)
> + ht->setup(td->cpu);
> + td->status = HP_THREAD_ACTIVE;
> + break;
> + case HP_THREAD_PARKED:
> + if (ht->unpark)
> + ht->unpark(td->cpu);
> + td->status = HP_THREAD_ACTIVE;
> + break;
> + }
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(smpboot_thread_check_parking);
> +
> +static int
> +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
> +{
> + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> + struct smpboot_thread_data *td;
> +
> + if (tsk)
> + return 0;
> +
> + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
> + if (!td)
> + return -ENOMEM;
> + td->cpu = cpu;
> + td->ht = ht;
> +
> + tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm);
> + if (IS_ERR(tsk))
> + return PTR_ERR(tsk);
> +
> + get_task_struct(tsk);
> + *per_cpu_ptr(ht->store, cpu) = tsk;
> + return 0;
> +}
> +
> +int smpboot_create_threads(unsigned int cpu)
> +{
> + struct smp_hotplug_thread *cur;
> + int ret = 0;
> +
> + mutex_lock(&smpboot_threads_lock);
> + list_for_each_entry(cur, &hotplug_threads, list) {
> + ret = __smpboot_create_thread(cur, cpu);
> + if (ret)
> + break;
> + }
> + mutex_unlock(&smpboot_threads_lock);
> + return ret;
> +}
> +
> +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
> +{
> + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> +
> + kthread_unpark(tsk);
> +}
> +
> +void smpboot_unpark_threads(unsigned int cpu)
> +{
> + struct smp_hotplug_thread *cur;
> +
> + mutex_lock(&smpboot_threads_lock);
> + list_for_each_entry(cur, &hotplug_threads, list)
> + smpboot_unpark_thread(cur, cpu);
> + mutex_unlock(&smpboot_threads_lock);
> +}
> +
> +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
> +{
> + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> +
> + if (tsk)
> + kthread_park(tsk);
> +}
> +
> +void smpboot_park_threads(unsigned int cpu)
> +{
> + struct smp_hotplug_thread *cur;
> +
> + mutex_lock(&smpboot_threads_lock);
> + list_for_each_entry(cur, &hotplug_threads, list)

Shouldn't this be list_for_each_entry_reverse()? Yes, the notifiers
still run in the same order for both online and offline, but all uses
of smpboot_park_threads() would be new, so should be OK with the
proper ordering, right?

> + smpboot_park_thread(cur, cpu);
> + mutex_unlock(&smpboot_threads_lock);
> +}
> +
> +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
> +{
> + unsigned int cpu;
> +
> + /* We need to destroy also the parked threads of offline cpus */
> + for_each_possible_cpu(cpu) {
> + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> +
> + if (tsk) {
> + kthread_stop(tsk);
> + put_task_struct(tsk);
> + *per_cpu_ptr(ht->store, cpu) = NULL;
> + }
> + }
> +}
> +
> +/**
> + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
> + * @plug_thread: Hotplug thread descriptor
> + *
> + * Creates and starts the threads on all online cpus.
> + */
> +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
> +{
> + unsigned int cpu;
> + int ret = 0;
> +
> + mutex_lock(&smpboot_threads_lock);
> + for_each_online_cpu(cpu) {
> + ret = __smpboot_create_thread(plug_thread, cpu);
> + if (ret) {
> + smpboot_destroy_threads(plug_thread);
> + goto out;
> + }
> + smpboot_unpark_thread(plug_thread, cpu);
> + }
> + list_add(&plug_thread->list, &hotplug_threads);
> +out:
> + mutex_unlock(&smpboot_threads_lock);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
> +
> +/**
> + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
> + * @plug_thread: Hotplug thread descriptor
> + *
> + * Stops all threads on all possible cpus.
> + */
> +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
> +{
> + get_online_cpus();
> + mutex_lock(&smpboot_threads_lock);
> + list_del(&plug_thread->list);
> + smpboot_destroy_threads(plug_thread);
> + mutex_unlock(&smpboot_threads_lock);
> + put_online_cpus();
> +}
> +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
> Index: tip/kernel/smpboot.h
> ===================================================================
> --- tip.orig/kernel/smpboot.h
> +++ tip/kernel/smpboot.h
> @@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_
> static inline void idle_threads_init(void) { }
> #endif
>
> +int smpboot_create_threads(unsigned int cpu);
> +void smpboot_park_threads(unsigned int cpu);
> +void smpboot_unpark_threads(unsigned int cpu);
> +
> #endif
>
>

2012-06-13 18:56:27

by Thomas Gleixner

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
>
> So I am currently trying to apply this to RCU's per-CPU kthread.
> I don't believe that I need to mess with RCU's per-rcu_node kthread
> because it can just have its affinity adjusted when the first CPU
> onlines and the last CPU offlines for the corresponding rcu_node.
>
> One question below about the order of parking.
>
> Also, I have not yet figured out how this avoids a parked thread waking
> up while the CPU is offline, but I am probably still missing something.

If it's just a spurious wakeup then it goes back to sleep right away
as nothing cleared the park bit.

If something calls unpark(), then it's toast. I should put a warning
into the code somewhere to catch that case.
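
A minimal sketch of where such a warning could live, e.g. in the
unpark path (illustration of the idea only, not tested and not part of
the posted patch):

static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);

	/* Unparking a thread for an offline cpu would let it run unbound */
	WARN_ON(!cpu_online(cpu));
	kthread_unpark(tsk);
}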

> > +void smpboot_park_threads(unsigned int cpu)
> > +{
> > + struct smp_hotplug_thread *cur;
> > +
> > + mutex_lock(&smpboot_threads_lock);
> > + list_for_each_entry(cur, &hotplug_threads, list)
>
> Shouldn't this be list_for_each_entry_reverse()? Yes, the notifiers
> still run in the same order for both online and offline, but all uses
> of smpboot_park_threads() would be new, so should be OK with the
> proper ordering, right?

Duh, yes

Thanks,

tglx

2012-06-13 18:58:10

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> Provide a generic interface for setting up and tearing down percpu
> threads.
>
> On registration, the threads for already online CPUs are created and
> started. On deregistration (module exit), the threads are stopped.
>
> During hotplug operations, the threads are created, started, parked and
> unparked. The data structure for registration provides a pointer to
> percpu storage space and optional setup, cleanup, park and unpark
> functions. These functions are called when the thread state changes.
>
> Thread functions should look like the following:
>
> int thread_fn(void *cookie)
> {
> 	while (!smpboot_thread_check_parking(cookie)) {
> 		do_stuff();
> 	}
> 	return 0;
> }
>
> Signed-off-by: Thomas Gleixner <[email protected]>

[ . . . ]

> Index: tip/kernel/cpu.c
> ===================================================================
> --- tip.orig/kernel/cpu.c
> +++ tip/kernel/cpu.c
> @@ -280,6 +280,7 @@ static int __ref _cpu_down(unsigned int
> __func__, cpu);
> goto out_release;
> }
> + smpboot_park_threads(cpu);
>
> err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
> if (err) {
> @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned in
> goto out;
> }
>
> + ret = smpboot_create_threads(cpu);
> + if (ret)
> + goto out;
> +
> ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
> if (ret) {
> nr_calls--;
> @@ -370,6 +375,7 @@ static int __cpuinit _cpu_up(unsigned in
>
> /* Now call notifier in preparation. */
> cpu_notify(CPU_ONLINE | mod, hcpu);
> + smpboot_unpark_threads(cpu);

OK, RCU must use the lower-level interfaces, given that one of
the CPU_ONLINE notifiers might invoke synchronize_rcu().

Thanx, Paul

2012-06-13 19:02:59

by Thomas Gleixner

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > /* Now call notifier in preparation. */
> > cpu_notify(CPU_ONLINE | mod, hcpu);
> > + smpboot_unpark_threads(cpu);
>
> OK, RCU must use the lower-level interfaces, given that one of
> the CPU_ONLINE notifiers might invoke synchronize_rcu().

We can start the threads before the notifiers. There is no
restriction.

Thanks,

tglx

2012-06-13 19:18:20

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > /* Now call notifier in preparation. */
> > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > + smpboot_unpark_threads(cpu);
> >
> > OK, RCU must use the lower-level interfaces, given that one of
> > the CPU_ONLINE notifiers might invoke synchronize_rcu().
>
> We can start the threads before the notifiers. There is no
> restriction.

Sounds very good in both cases!

Thanx, Paul

2012-06-13 20:47:34

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 12:17:45PM -0700, Paul E. McKenney wrote:
> On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> > On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > > /* Now call notifier in preparation. */
> > > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > > + smpboot_unpark_threads(cpu);
> > >
> > > OK, RCU must use the lower-level interfaces, given that one of
> > > the CPU_ONLINE notifiers might invoke synchronize_rcu().
> >
> > We can start the threads before the notifiers. There is no
> > restriction.
>
> Sounds very good in both cases!

Just for reference, here is what I am using.


Anticipatory changes in advance of Thomas's next patchset

Signed-off-by: Paul E. McKenney <[email protected]>

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ab5d82..510ba5e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -372,10 +372,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
+	smpboot_unpark_threads(cpu);

 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
-	smpboot_unpark_threads(cpu);

 out_notify:
 	if (ret != 0)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d7a3659..7d54e7c 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -201,7 +201,7 @@ void smpboot_park_threads(unsigned int cpu)
 	struct smp_hotplug_thread *cur;

 	mutex_lock(&smpboot_threads_lock);
-	list_for_each_entry(cur, &hotplug_threads, list)
+	list_for_each_entry_reverse(cur, &hotplug_threads, list)
 		smpboot_park_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
 }

2012-06-14 04:51:33

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 01:47:25PM -0700, Paul E. McKenney wrote:
> On Wed, Jun 13, 2012 at 12:17:45PM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> > > On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > > > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > > > /* Now call notifier in preparation. */
> > > > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > > > + smpboot_unpark_threads(cpu);
> > > >
> > > > OK, RCU must use the lower-level interfaces, given that one of
> > > > the CPU_ONLINE notifiers might invoke synchronize_rcu().
> > >
> > > We can start the threads before the notifiers. There is no
> > > restriction.
> >
> > Sounds very good in both cases!
>
> Just for reference, here is what I am using.

And here is a buggy first attempt to make RCU use the smpboot interfaces.
Probably still bugs in my adaptation, as it still hangs in the first
attempt to offline a CPU. If I revert the softirq smpboot commit, the
offline still hangs somewhere near the __stop_machine() processing, but
the system continues running otherwise. Will continue debugging tomorrow.

When doing this sort of conversion, renaming the per-CPU variable used
to hold the kthreads' task_struct pointers is highly recommended --
failing to do so cost me substantial confusion. ;-)

Thanx, Paul

------------------------------------------------------------------------

rcu: Use smp_hotplug_thread facility for RCU's per-CPU kthread

Bring RCU into the new-age CPU-hotplug fold by modifying RCU's per-CPU
kthread code to use the new smp_hotplug_thread facility.

Not-signed-off-by: Paul E. McKenney <[email protected]>

rcutree.c | 4 -
rcutree.h | 2
rcutree_plugin.h | 177 ++++++++-----------------------------------------------
rcutree_trace.c | 3
4 files changed, 27 insertions(+), 159 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88..7813d7d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -125,7 +125,6 @@ static int rcu_scheduler_fully_active __read_mostly;
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);

@@ -1458,7 +1457,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */

/* Adjust any no-longer-needed kthreads. */
- rcu_stop_cpu_kthread(cpu);
rcu_node_kthread_setaffinity(rnp, -1);

/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -2514,11 +2512,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_ONLINE:
case CPU_DOWN_FAILED:
rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
break;
case CPU_DOWN_PREPARE:
rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138..70883af 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -434,7 +434,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -472,7 +471,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
static void invoke_rcu_node_kthread(struct rcu_node *rnp);
static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000..f789341 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/

#include <linux/delay.h>
+#include <linux/smpboot.h>

#define RCU_KTHREAD_PRIO 1

@@ -1449,25 +1450,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1490,30 +1472,6 @@ static void invoke_rcu_node_kthread(struct rcu_node *rnp)
}

/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
* Timer handler to initiate the waking up of per-CPU kthreads that
* have yielded the CPU due to excess numbers of RCU callbacks.
* We wake up the per-rcu_node kthread, which in turn will wake up
@@ -1553,63 +1511,35 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
}

/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
-}
-
-/*
* Per-CPU kernel thread that invokes RCU callbacks. This replaces the
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
-static int rcu_cpu_kthread(void *arg)
+static int rcu_cpu_kthread(void *cookie)
{
- int cpu = (int)(long)arg;
unsigned long flags;
+ struct sched_param sp;
int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ char *workp = &__get_cpu_var(rcu_cpu_has_work);

- trace_rcu_utilization("Start CPU kthread@init");
+ trace_rcu_utilization("Start CPU kthread@unpark");
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
for (;;) {
*statusp = RCU_KTHREAD_WAITING;
trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
+ rcu_wait(*workp != 0 ||
+ smpboot_thread_check_parking(cookie));
trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
+ if (smpboot_thread_check_parking(cookie)) {
local_bh_enable();
break;
}
*statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
+ this_cpu_inc(rcu_cpu_kthread_loops);
local_irq_save(flags);
work = *workp;
*workp = 0;
@@ -1624,59 +1554,14 @@ static int rcu_cpu_kthread(void *arg)
if (spincnt > 10) {
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ rcu_yield(rcu_cpu_kthread_timer,
+ smp_processor_id());
trace_rcu_utilization("Start CPU kthread@rcu_yield");
spincnt = 0;
}
}
*statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
+ trace_rcu_utilization("End CPU kthread@park");
return 0;
}

@@ -1788,6 +1673,12 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
}

+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+};
+
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
@@ -1797,11 +1688,9 @@ static int __init rcu_spawn_kthreads(void)
struct rcu_node *rnp;

rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
rnp = rcu_get_root(rcu_state);
(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
@@ -1818,11 +1707,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;

/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ if (rcu_scheduler_fully_active &&
+ rnp->node_kthread_task == NULL)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1846,22 +1733,10 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-}
-
static int __init rcu_scheduler_really_started(void)
{
rcu_scheduler_fully_active = 1;
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16d..6b4c76b 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);

2012-06-14 08:09:06

by Peter Zijlstra

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> If it's just a spurious wakeup then it goes back to sleep right away
> as nothing cleared the park bit.

Your spurious wakeup will have destroyed the binding though. So you need
to be careful.

2012-06-14 08:17:37

by Peter Zijlstra

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 2012-06-14 at 10:08 +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> > If it's just a spurious wakeup then it goes back to sleep right away
> > as nothing cleared the park bit.
>
> Your spurious wakeup will have destroyed the binding though. So you need
> to be careful.

We should probably do something like the below..

TJ, does this wreck workqueues? It's somewhat 'creative' in that regard
and really wants fixing.

---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5018,6 +5018,8 @@ void do_set_cpus_allowed(struct task_str

 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
+	if (p->nr_cpus_allowed != 1)
+		p->flags &= ~PF_THREAD_BOUND;
 }

 /*

2012-06-14 08:33:00

by Srivatsa S. Bhat

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On 06/13/2012 04:30 PM, Thomas Gleixner wrote:

> Provide a generic interface for setting up and tearing down percpu
> threads.
>
> On registration, the threads for already online CPUs are created and
> started. On deregistration (module exit), the threads are stopped.
>
> During hotplug operations, the threads are created, started, parked and
> unparked. The data structure for registration provides a pointer to
> percpu storage space and optional setup, cleanup, park and unpark
> functions. These functions are called when the thread state changes.
>
> Thread functions should look like the following:
>
> int thread_fn(void *cookie)
> {
> 	while (!smpboot_thread_check_parking(cookie)) {
> 		do_stuff();
> 	}
> 	return 0;
> }
>


This patchset looks great! But I have some comments below:


> Index: tip/kernel/cpu.c
> ===================================================================
> --- tip.orig/kernel/cpu.c
> +++ tip/kernel/cpu.c
> @@ -280,6 +280,7 @@ static int __ref _cpu_down(unsigned int
> __func__, cpu);
> goto out_release;
> }
> + smpboot_park_threads(cpu);
>


If cpu_down fails further down, don't we want to unpark these threads
as part of error recovery?

> err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
> if (err) {
> @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned in
> goto out;
> }
>
> + ret = smpboot_create_threads(cpu);
> + if (ret)
> + goto out;
> +


Here also, we might want to clean up on error right?

> ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
> if (ret) {
> nr_calls--;
> @@ -370,6 +375,7 @@ static int __cpuinit _cpu_up(unsigned in
>
> /* Now call notifier in preparation. */
> cpu_notify(CPU_ONLINE | mod, hcpu);
> + smpboot_unpark_threads(cpu);
>
> out_notify:
> if (ret != 0)



Regards,
Srivatsa S. Bhat

2012-06-14 08:38:01

by Thomas Gleixner

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Peter Zijlstra wrote:

> On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> > If it's just a spurious wakeup then it goes back to sleep right away
> > as nothing cleared the park bit.
>
> Your spurious wakeup will have destroyed the binding though. So you need
> to be careful.

We explicitly rebind it on unpark().
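
Roughly, the unpark path can do the rebind before waking the thread.
This is only a sketch of the idea with made-up helper names, not the
actual kthread code:

void kthread_unpark(struct task_struct *k)
{
	clear_park_bit(k);			/* hypothetical helper */
	/* Restore the binding a spurious wakeup may have destroyed */
	if (is_per_cpu_kthread(k))		/* hypothetical helper */
		kthread_bind(k, park_cpu_of(k));	/* park_cpu_of() is hypothetical */
	wake_up_process(k);
}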

2012-06-14 08:38:14

by Peter Zijlstra

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 2012-06-14 at 10:37 +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Peter Zijlstra wrote:
>
> > On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> > > If it's just a spurious wakeup then it goes back to sleep right away
> > > as nothing cleared the park bit.
> >
> > Your spurious wakeup will have destroyed the binding though. So you need
> > to be careful.
>
> We explicitly rebind it on unpark().

OK, no worries then.

2012-06-14 08:45:00

by Thomas Gleixner

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Srivatsa S. Bhat wrote:
> On 06/13/2012 04:30 PM, Thomas Gleixner wrote:
> > @@ -280,6 +280,7 @@ static int __ref _cpu_down(unsigned int
> > __func__, cpu);
> > goto out_release;
> > }
> > + smpboot_park_threads(cpu);
> >
>
>
> If cpu_down fails further down, don't we want to unpark these threads
> as part of error recovery?

Right.

> > err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
> > if (err) {
> > @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned in
> > goto out;
> > }
> >
> > + ret = smpboot_create_threads(cpu);
> > + if (ret)
> > + goto out;
> > +
>
>
> Here also, we might want to clean up on error right?

Good question. If we failed to create a thread, we can't online the
cpu, but we might try again later. So now the question is whether we
really need to completely destroy the already created and parked ones
or just leave them around. No real opinion on that, I just picked the
lazy way :)

Thanks,

tglx
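
A sketch of the unpark-on-failure fix in _cpu_down(), assuming the
existing error path around __stop_machine() (illustration only, not
the committed fix):

	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
	if (err) {
		/* CPU didn't die: tell everyone. Can't complain. */
		smpboot_unpark_threads(cpu);
		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
		goto out_release;
	}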

2012-06-14 11:20:50

by Thomas Gleixner

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, 13 Jun 2012, Paul E. McKenney wrote:

> On Wed, Jun 13, 2012 at 01:47:25PM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 13, 2012 at 12:17:45PM -0700, Paul E. McKenney wrote:
> > > On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> > > > On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > > > > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > > > > /* Now call notifier in preparation. */
> > > > > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > > > > + smpboot_unpark_threads(cpu);
> > > > >
> > > > > OK, RCU must use the lower-level interfaces, given that one of
> > > > > the CPU_ONLINE notifiers might invoke synchronize_rcu().
> > > >
> > > > We can start the threads before the notifiers. There is no
> > > > restriction.
> > >
> > > Sounds very good in both cases!
> >
> > Just for reference, here is what I am using.
>
> And here is a buggy first attempt to make RCU use the smpboot interfaces.
> Probably still bugs in my adaptation, as it still hangs in the first
> attempt to offline a CPU. If I revert the softirq smpboot commit, the
> offline still hangs somewhere near the __stop_machine() processing, but
> the system continues running otherwise. Will continue debugging tomorrow.

I gave it a quick shot, but I was not able to reproduce the hang yet.

But looking at the thread function made me look into rcu_yield() and I
really wonder what kind of drug induced that particular piece of
horror.

I can't figure out why this yield business is necessary at all. The
commit logs are as helpful as the missing code comments :)

I suspect that it's some starvation issue. But if we need it, then
can't we replace it with something sane like the (untested) patch
below?

Thanks,

tglx
---
kernel/rcutree.h | 2 -
kernel/rcutree_plugin.h | 89 ++++++++++--------------------------------------
2 files changed, 19 insertions(+), 72 deletions(-)

Index: tip/kernel/rcutree.h
===================================================================
--- tip.orig/kernel/rcutree.h
+++ tip/kernel/rcutree.h
@@ -469,8 +469,6 @@ static void rcu_boost_kthread_setaffinit
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp,
 						 int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
 static void __cpuinit rcu_prepare_kthreads(int cpu);
Index: tip/kernel/rcutree_plugin.h
===================================================================
--- tip.orig/kernel/rcutree_plugin.h
+++ tip/kernel/rcutree_plugin.h
@@ -1217,6 +1217,16 @@ static void rcu_initiate_boost_trace(str

 #endif /* #else #ifdef CONFIG_RCU_TRACE */

+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+	/*
+	 * If the thread is yielding, only wake it when this
+	 * is invoked from idle
+	 */
+	if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+		wake_up_process(t);
+}
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1289,17 +1299,6 @@ static int rcu_boost(struct rcu_node *rn
 }

 /*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost. We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
-	invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
-/*
  * Priority-boosting kthread. One per leaf rcu_node and one for the
  * root rcu_node.
  */
@@ -1322,8 +1321,9 @@ static int rcu_boost_kthread(void *arg)
 		else
 			spincnt = 0;
 		if (spincnt > 10) {
+			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
 			trace_rcu_utilization("End boost kthread@rcu_yield");
-			rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+			schedule_timeout_interruptible(2);
 			trace_rcu_utilization("Start boost kthread@rcu_yield");
 			spincnt = 0;
 		}
@@ -1361,8 +1361,8 @@ static void rcu_initiate_boost(struct rc
 		rnp->boost_tasks = rnp->gp_tasks;
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		t = rnp->boost_kthread_task;
-		if (t != NULL)
-			wake_up_process(t);
+		if (t)
+			rcu_wake_cond(t, rnp->boost_kthread_status);
 	} else {
 		rcu_initiate_boost_trace(rnp);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1379,8 +1379,10 @@ static void invoke_rcu_callbacks_kthread
 	local_irq_save(flags);
 	__this_cpu_write(rcu_cpu_has_work, 1);
 	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
-	    current != __this_cpu_read(rcu_cpu_kthread_task))
-		wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+	    current != __this_cpu_read(rcu_cpu_kthread_task)) {
+		rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+			      __this_cpu_read(rcu_cpu_kthread_status));
+	}
 	local_irq_restore(flags);
 }

@@ -1476,20 +1478,6 @@ static void rcu_kthread_do_work(void)
 }

 /*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
-	struct task_struct *t;
-
-	t = rnp->node_kthread_task;
-	if (t != NULL)
-		wake_up_process(t);
-}
-
-/*
  * Set the specified CPU's kthread to run RT or not, as specified by
  * the to_rt argument. The CPU-hotplug locks are held, so the task
  * is not going away.
@@ -1514,45 +1502,6 @@ static void rcu_cpu_kthread_setrt(int cp
 }

 /*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
-	struct rcu_node *rnp = rdp->mynode;
-
-	atomic_or(rdp->grpmask, &rnp->wakemask);
-	invoke_rcu_node_kthread(rnp);
-}
-
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
-{
-	struct sched_param sp;
-	struct timer_list yield_timer;
-	int prio = current->rt_priority;
-
-	setup_timer_on_stack(&yield_timer, f, arg);
-	mod_timer(&yield_timer, jiffies + 2);
-	sp.sched_priority = 0;
-	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
-	set_user_nice(current, 19);
-	schedule();
-	set_user_nice(current, 0);
-	sp.sched_priority = prio;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-	del_timer(&yield_timer);
-}
-
-/*
  * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
  * This can happen while the corresponding CPU is either coming online
  * or going offline. We cannot wait until the CPU is fully online
@@ -1624,7 +1573,7 @@ static int rcu_cpu_kthread(void *arg)
 		if (spincnt > 10) {
 			*statusp = RCU_KTHREAD_YIELDING;
 			trace_rcu_utilization("End CPU kthread@rcu_yield");
-			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+			schedule_timeout_interruptible(2);
 			trace_rcu_utilization("Start CPU kthread@rcu_yield");
 			spincnt = 0;
 		}

2012-06-14 12:59:49

by Paul E. McKenney

Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> On Wed, 13 Jun 2012, Paul E. McKenney wrote:
>
> > On Wed, Jun 13, 2012 at 01:47:25PM -0700, Paul E. McKenney wrote:
> > > On Wed, Jun 13, 2012 at 12:17:45PM -0700, Paul E. McKenney wrote:
> > > > On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> > > > > On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > > > > > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > > > > > /* Now call notifier in preparation. */
> > > > > > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > > > > > + smpboot_unpark_threads(cpu);
> > > > > >
> > > > > > OK, RCU must use the lower-level interfaces, given that one of
> > > > > > the CPU_ONLINE notifiers might invoke synchronize_rcu().
> > > > >
> > > > > We can start the threads before the notifiers. There is no
> > > > > restriction.
> > > >
> > > > Sounds very good in both cases!
> > >
> > > Just for reference, here is what I am using.
> >
> > And here is a buggy first attempt to make RCU use the smpboot interfaces.
> > Probably still bugs in my adaptation, as it still hangs in the first
> > attempt to offline a CPU. If I revert the softirq smpboot commit, the
> > offline still hangs somewhere near the __stop_machine() processing, but
> > the system continues running otherwise. Will continue debugging tomorrow.
>
> I gave it a quick shot, but I was not able to reproduce the hang yet.

Really? I have a strictly Western-Hemisphere bug? ;-)

> But looking at the thread function made me look into rcu_yield() and I
> really wonder what kind of drug induced that particular piece of
> horror.

When you are working on something like RCU priority boosting, no other
drug is in any way necessary. ;-)

> I can't figure out why this yield business is necessary at all. The
> commit logs are as helpful as the missing code comments :)
>
> I suspect that it's some starvation issue. But if we need it, then
> can't we replace it with something sane like the (untested) patch
> below?

Yep, starvation. I will take a look at your approach after I wake
up a bit more.

Thanx, Paul

> Thanks,
>
> tglx
> ---
> kernel/rcutree.h | 2 -
> kernel/rcutree_plugin.h | 89 ++++++++++--------------------------------------
> 2 files changed, 19 insertions(+), 72 deletions(-)
>
> Index: tip/kernel/rcutree.h
> ===================================================================
> --- tip.orig/kernel/rcutree.h
> +++ tip/kernel/rcutree.h
> @@ -469,8 +469,6 @@ static void rcu_boost_kthread_setaffinit
> static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
> struct rcu_node *rnp,
> int rnp_index);
> -static void invoke_rcu_node_kthread(struct rcu_node *rnp);
> -static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
> #endif /* #ifdef CONFIG_RCU_BOOST */
> static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
> static void __cpuinit rcu_prepare_kthreads(int cpu);
> Index: tip/kernel/rcutree_plugin.h
> ===================================================================
> --- tip.orig/kernel/rcutree_plugin.h
> +++ tip/kernel/rcutree_plugin.h
> @@ -1217,6 +1217,16 @@ static void rcu_initiate_boost_trace(str
>
> #endif /* #else #ifdef CONFIG_RCU_TRACE */
>
> +static void rcu_wake_cond(struct task_struct *t, int status)
> +{
> + /*
> + * If the thread is yielding, only wake it when this
> + * is invoked from idle
> + */
> + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
> + wake_up_process(t);
> +}
> +
> /*
> * Carry out RCU priority boosting on the task indicated by ->exp_tasks
> * or ->boost_tasks, advancing the pointer to the next task in the
> @@ -1289,17 +1299,6 @@ static int rcu_boost(struct rcu_node *rn
> }
>
> /*
> - * Timer handler to initiate waking up of boost kthreads that
> - * have yielded the CPU due to excessive numbers of tasks to
> - * boost. We wake up the per-rcu_node kthread, which in turn
> - * will wake up the booster kthread.
> - */
> -static void rcu_boost_kthread_timer(unsigned long arg)
> -{
> - invoke_rcu_node_kthread((struct rcu_node *)arg);
> -}
> -
> -/*
> * Priority-boosting kthread. One per leaf rcu_node and one for the
> * root rcu_node.
> */
> @@ -1322,8 +1321,9 @@ static int rcu_boost_kthread(void *arg)
> else
> spincnt = 0;
> if (spincnt > 10) {
> + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
> trace_rcu_utilization("End boost kthread@rcu_yield");
> - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
> + schedule_timeout_interruptible(2);
> trace_rcu_utilization("Start boost kthread@rcu_yield");
> spincnt = 0;
> }
> @@ -1361,8 +1361,8 @@ static void rcu_initiate_boost(struct rc
> rnp->boost_tasks = rnp->gp_tasks;
> raw_spin_unlock_irqrestore(&rnp->lock, flags);
> t = rnp->boost_kthread_task;
> - if (t != NULL)
> - wake_up_process(t);
> + if (t)
> + rcu_wake_cond(t, rnp->boost_kthread_status);
> } else {
> rcu_initiate_boost_trace(rnp);
> raw_spin_unlock_irqrestore(&rnp->lock, flags);
> @@ -1379,8 +1379,10 @@ static void invoke_rcu_callbacks_kthread
> local_irq_save(flags);
> __this_cpu_write(rcu_cpu_has_work, 1);
> if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
> - current != __this_cpu_read(rcu_cpu_kthread_task))
> - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
> + current != __this_cpu_read(rcu_cpu_kthread_task)) {
> + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
> + __this_cpu_read(rcu_cpu_kthread_status));
> + }
> local_irq_restore(flags);
> }
>
> @@ -1476,20 +1478,6 @@ static void rcu_kthread_do_work(void)
> }
>
> /*
> - * Wake up the specified per-rcu_node-structure kthread.
> - * Because the per-rcu_node kthreads are immortal, we don't need
> - * to do anything to keep them alive.
> - */
> -static void invoke_rcu_node_kthread(struct rcu_node *rnp)
> -{
> - struct task_struct *t;
> -
> - t = rnp->node_kthread_task;
> - if (t != NULL)
> - wake_up_process(t);
> -}
> -
> -/*
> * Set the specified CPU's kthread to run RT or not, as specified by
> * the to_rt argument. The CPU-hotplug locks are held, so the task
> * is not going away.
> @@ -1514,45 +1502,6 @@ static void rcu_cpu_kthread_setrt(int cp
> }
>
> /*
> - * Timer handler to initiate the waking up of per-CPU kthreads that
> - * have yielded the CPU due to excess numbers of RCU callbacks.
> - * We wake up the per-rcu_node kthread, which in turn will wake up
> - * the booster kthread.
> - */
> -static void rcu_cpu_kthread_timer(unsigned long arg)
> -{
> - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
> - struct rcu_node *rnp = rdp->mynode;
> -
> - atomic_or(rdp->grpmask, &rnp->wakemask);
> - invoke_rcu_node_kthread(rnp);
> -}
> -
> -/*
> - * Drop to non-real-time priority and yield, but only after posting a
> - * timer that will cause us to regain our real-time priority if we
> - * remain preempted. Either way, we restore our real-time priority
> - * before returning.
> - */
> -static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
> -{
> - struct sched_param sp;
> - struct timer_list yield_timer;
> - int prio = current->rt_priority;
> -
> - setup_timer_on_stack(&yield_timer, f, arg);
> - mod_timer(&yield_timer, jiffies + 2);
> - sp.sched_priority = 0;
> - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
> - set_user_nice(current, 19);
> - schedule();
> - set_user_nice(current, 0);
> - sp.sched_priority = prio;
> - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
> - del_timer(&yield_timer);
> -}
> -
> -/*
> * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
> * This can happen while the corresponding CPU is either coming online
> * or going offline. We cannot wait until the CPU is fully online
> @@ -1624,7 +1573,7 @@ static int rcu_cpu_kthread(void *arg)
> if (spincnt > 10) {
> *statusp = RCU_KTHREAD_YIELDING;
> trace_rcu_utilization("End CPU kthread@rcu_yield");
> - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
> + schedule_timeout_interruptible(2);
> trace_rcu_utilization("Start CPU kthread@rcu_yield");
> spincnt = 0;
> }
>

2012-06-14 13:01:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 2012-06-14 at 05:59 -0700, Paul E. McKenney wrote:
> Yep, starvation.

What's the particular starvation case?

2012-06-14 13:32:26

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > I gave it a quick shot, but I was not able to reproduce the hang yet.
>
> Really? I have a strictly Western-Hemisphere bug? ;-)

I guess I need to fire up rcu torture to make it surface.

> > But looking at the thread function made me look into rcu_yield() and I
> > really wonder what kind of drug induced that particular piece of
> > horror.
>
> When you are working on something like RCU priority boosting, no other
> drug is in any way necessary. ;-)

And how do we protect minors from that ?

> > I can't figure out why this yield business is necessary at all. The
> > commit logs are as helpful as the missing code comments :)
> >
> > I suspect that it's some starvation issue. But if we need it, then
> > can't we replace it with something sane like the (untested) patch
> > below?
>
> Yep, starvation. I will take a look at your approach after I wake
> up a bit more.

Btw, if that simpler yield approach is working (and I can't see why it
shouldn't), then you can get rid of the node task as well. The only
purpose of it is to push up the priority of yielding tasks, right?

Thanks,

tglx

2012-06-14 13:47:42

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Thomas Gleixner wrote:

> On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> >
> > Really? I have a strictly Western-Hemisphere bug? ;-)
>
> I guess I need to fire up rcu torture to make it surface.
>
> > > But looking at the thread function made me look into rcu_yield() and I
> > > really wonder what kind of drug induced that particular piece of
> > > horror.
> >
> > When you are working on something like RCU priority boosting, no other
> > drug is in any way necessary. ;-)
>
> And how do we protect minors from that ?
>
> > > I can't figure out why this yield business is necessary at all. The
> > > commit logs are as helpful as the missing code comments :)
> > >
> > > I suspect that it's some starvation issue. But if we need it, then
> > > can't we replace it with something sane like the (untested) patch
> > > below?
> >
> > Yep, starvation. I will take a look at your approach after I wake
> > up a bit more.
>
> Btw, if that simpler yield approach is working and I can't see why it
> shouldn't then you can get rid of the node task as well. The only
> purpose of it is to push up the priority of yielding tasks, right?

Ah, missed that it calls rcu_initiate_boost() as well....

2012-06-14 14:12:19

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Thomas Gleixner wrote:
>
> > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > >
> > > Really? I have a strictly Western-Hemisphere bug? ;-)
> >
> > I guess I need to fire up rcu torture to make it surface.
> >
> > > > But looking at the thread function made me look into rcu_yield() and I
> > > > really wonder what kind of drug induced that particular piece of
> > > > horror.
> > >
> > > When you are working on something like RCU priority boosting, no other
> > > drug is in any way necessary. ;-)
> >
> > And how do we protect minors from that ?
> >
> > > > I can't figure out why this yield business is necessary at all. The
> > > > commit logs are as helpful as the missing code comments :)
> > > >
> > > > I suspect that it's some starvation issue. But if we need it, then
> > > > can't we replace it with something sane like the (untested) patch
> > > > below?
> > >
> > > Yep, starvation. I will take a look at your approach after I wake
> > > up a bit more.
> >
> > Btw, if that simpler yield approach is working and I can't see why it
> > shouldn't then you can get rid of the node task as well. The only
> > purpose of it is to push up the priority of yielding tasks, right?
>
> Ah, missed that it calls rcu_initiate_boost() as well....

And looking further, I really don't understand why it's doing
that. That node thread is only woken by these weird yield timers.

Thanks,

tglx

2012-06-14 14:47:56

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 03:01:27PM +0200, Peter Zijlstra wrote:
> On Thu, 2012-06-14 at 05:59 -0700, Paul E. McKenney wrote:
> > Yep, starvation.
>
> What's the particular starvation case?

RCU callback processing consumes the entire CPU in the RCU_BOOST case,
where processing runs at real-time priority. This is analogous to RT
throttling in the scheduler.
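
A minimal sketch of that starvation pattern, with wait_for_work() and
process_callbacks() as hypothetical stand-ins for the real callback
machinery (not actual kernel code):

static void wait_for_work(void);	/* placeholder */
static void process_callbacks(void);	/* placeholder */

static int rt_callback_kthread(void *unused)
{
	for (;;) {
		wait_for_work();	/* returns immediately under load */
		process_callbacks();	/* the queue refills while we run */
		/*
		 * A SCHED_FIFO task is only preempted by higher-priority
		 * RT tasks, so without an explicit yield (the old
		 * rcu_yield(), or the schedule_timeout_interruptible(2)
		 * replacing it) nothing at SCHED_OTHER ever runs here.
		 */
	}
	return 0;
}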

Thanx, Paul

2012-06-14 14:50:53

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 03:32:19PM +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> >
> > Really? I have a strictly Western-Hemisphere bug? ;-)
>
> I guess I need to fire up rcu torture to make it surface.

A simple offline was triggering it for me. Perhaps some of my debug
code was inappropriate, will retry.

> > > But looking at the thread function made me look into rcu_yield() and I
> > > really wonder what kind of drug induced that particular piece of
> > > horror.
> >
> > When you are working on something like RCU priority boosting, no other
> > drug is in any way necessary. ;-)
>
> And how do we protect minors from that ?

We rely on their own sense of self-preservation preventing them from
getting involved in such insanity.

> > > I can't figure out why this yield business is necessary at all. The
> > > commit logs are as helpful as the missing code comments :)
> > >
> > > I suspect that it's some starvation issue. But if we need it, then
> > > can't we replace it with something sane like the (untested) patch
> > > below?
> >
> > Yep, starvation. I will take a look at your approach after I wake
> > up a bit more.
>
> Btw, if that simpler yield approach is working and I can't see why it
> shouldn't then you can get rid of the node task as well. The only
> purpose of it is to push up the priority of yielding tasks, right?

It also boosts the priority of preempted RCU read-side critical sections.

Thanx, Paul

2012-06-14 14:56:37

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 2012-06-14 at 07:47 -0700, Paul E. McKenney wrote:
> RCU callback processing consumes the entire CPU in RCU_BOOST case where
> processing runs at real-time priority. This is analogous to RT throttling
> in the scheduler.

But previously we ran in non-preemptible softirq context; why would
it behave differently when done from an RT task?

Also, no, it's not quite like the throttling; that really idles the cpu
even if there are no SCHED_OTHER tasks to run.

2012-06-14 15:02:15

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> On Thu, Jun 14, 2012 at 03:32:19PM +0200, Thomas Gleixner wrote:
> > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > >
> > > Really? I have a strictly Western-Hemisphere bug? ;-)
> >
> > I guess I need to fire up rcu torture to make it surface.
>
> A simple offline was triggering it for me. Perhaps some of my debug
> code was inappropriate, will retry.
>
> > > > But looking at the thread function made me look into rcu_yield() and I
> > > > really wonder what kind of drug induced that particular piece of
> > > > horror.
> > >
> > > When you are working on something like RCU priority boosting, no other
> > > drug is in any way necessary. ;-)
> >
> > And how do we protect minors from that ?
>
> We rely on their own sense of self-preservation preventing them from
> getting involved in such insanity.
>
> > > > I can't figure out why this yield business is necessary at all. The
> > > > commit logs are as helpful as the missing code comments :)
> > > >
> > > > I suspect that it's some starvation issue. But if we need it, then
> > > > can't we replace it with something sane like the (untested) patch
> > > > below?
> > >
> > > Yep, starvation. I will take a look at your approach after I wake
> > > up a bit more.
> >
> > Btw, if that simpler yield approach is working and I can't see why it
> > shouldn't then you can get rid of the node task as well. The only
> > purpose of it is to push up the priority of yielding tasks, right?
>
> It also boosts the priority of preempted RCU read-side critical sections.

Though the only way this thread is invoked is via the timeout of that
yield timer, so I really have a hard time understanding that.

cpu_kthread()
    ....
    yield()
        timer fires -> mark cpu in mask and wakeup node kthread

node_kthread()
    do magic boost invocation
    push priority of cpu_kthread marked in mask

For the boost thread it looks like:

boost_kthread()
    .....
    yield()
        timer fires -> wakeup node kthread

node_kthread()
    do magic boost invocation, but no prio adjustment of boost thread.

/me scratches head

2012-06-14 15:03:52

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 04:12:14PM +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Thomas Gleixner wrote:
> > On Thu, 14 Jun 2012, Thomas Gleixner wrote:
> >
> > > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > > >
> > > > Really? I have a strictly Western-Hemisphere bug? ;-)
> > >
> > > I guess I need to fire up rcu torture to make it surface.
> > >
> > > > > But looking at the thread function made me look into rcu_yield() and I
> > > > > really wonder what kind of drug induced that particular piece of
> > > > > horror.
> > > >
> > > > When you are working on something like RCU priority boosting, no other
> > > > drug is in any way necessary. ;-)
> > >
> > > And how do we protect minors from that ?
> > >
> > > > > I can't figure out why this yield business is necessary at all. The
> > > > > commit logs are as helpful as the missing code comments :)
> > > > >
> > > > > I suspect that it's some starvation issue. But if we need it, then
> > > > > can't we replace it with something sane like the (untested) patch
> > > > > below?
> > > >
> > > > Yep, starvation. I will take a look at your approach after I wake
> > > > up a bit more.
> > >
> > > Btw, if that simpler yield approach is working and I can't see why it
> > > shouldn't then you can get rid of the node task as well. The only
> > > purpose of it is to push up the priority of yielding tasks, right?
> >
> > Ah, missed that it calls rcu_initiate_boost() as well....
>
> And looking further, I really don't understand why it's doing
> that. That node thread is only woken by these weird yield timers.

If your patch works out, it indeed might be possible to get rid of
->node_kthread_task. The ->boost_kthread_task needs to stay, however.

Thanx, Paul

2012-06-14 15:07:34

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Peter Zijlstra wrote:

> On Thu, 2012-06-14 at 07:47 -0700, Paul E. McKenney wrote:
> > RCU callback processing consumes the entire CPU in RCU_BOOST case where
> > processing runs at real-time priority. This is analogous to RT throttling
> > in the scheduler.
>
> But previously we ran in non-preemptible softirq context; why would
> it behave differently when done from an RT task?

softirqs are different. They loop ten times and then wake ksoftirqd,
which runs at SCHED_OTHER.

Though that's wonky, because if an interrupt arrives before ksoftirqd
can take over, we loop another 10 times in irq_exit(). Rinse and
repeat.....
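
Roughly, the pattern looks like this (a simplified sketch, not the
verbatim __do_softirq() code; handle_pending_softirqs(),
softirqs_pending() and wakeup_softirqd() stand in for the real
helpers):

#define MAX_SOFTIRQ_RESTART	10

static void handle_pending_softirqs(void);	/* placeholder */
static int softirqs_pending(void);		/* placeholder */
static void wakeup_softirqd(void);		/* placeholder */

static void do_softirq_sketch(void)
{
	int restart = MAX_SOFTIRQ_RESTART;

	do {
		handle_pending_softirqs();	/* run the pending vectors */
	} while (softirqs_pending() && --restart);

	/* After ten rounds, punt to ksoftirqd, which runs at
	 * SCHED_OTHER and therefore cannot starve the CPU. */
	if (softirqs_pending())
		wakeup_softirqd();
}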

And the main difference is that the scheduler does not yell about
softirq context, but it does yell when an RT task monopolizes the cpu.

Thanks,

tglx

2012-06-14 15:08:12

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> On Thu, Jun 14, 2012 at 04:12:14PM +0200, Thomas Gleixner wrote:
> > > > Btw, if that simpler yield approach is working and I can't see why it
> > > > shouldn't then you can get rid of the node task as well. The only
> > > > purpose of it is to push up the priority of yielding tasks, right?
> > >
> > > Ah, missed that it calls rcu_initiate_boost() as well....
> >
> > And looking further, I really don't understand why it's doing
> > that. That node thread is only woken by these weird yield timers.
>
> If your patch works out, it indeed might be possible to get rid of
> ->node_kthread_task. The ->boost_kthread_task needs to stay, however.

That's right.

2012-06-14 15:38:55

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 07:50:38AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 14, 2012 at 03:32:19PM +0200, Thomas Gleixner wrote:
> > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > >
> > > Really? I have a strictly Western-Hemisphere bug? ;-)
> >
> > I guess I need to fire up rcu torture to make it surface.
>
> A simple offline was triggering it for me. Perhaps some of my debug
> code was inappropriate, will retry.

And removing my debug code did the trick. I guess some of it was really
just bug code. In any case, it now passes short rcutorture testing for
me as well.

However, I do see the splat shown below. The splat confuses me greatly,
as I thought that this kthread should be affinitied to a single CPU and
thus smp_processor_id() should refrain from splatting. I don't see any
sort of "I broke affinity" message from the scheduler.

Thanx, Paul

[ 293.647090] rcu-torture:rcu_torture_onoff task: onlining 1
[ 293.921180] BUG: using smp_processor_id() in preemptible [00000000] code: ksoftirqd/1/13
[ 293.923686] ------------[ cut here ]------------
[ 293.924467] kernel BUG at /media/homes/git/linux-2.6-tip.test/kernel/smpboot.c:107!
[ 293.924855] invalid opcode: 0000 [#1] PREEMPT SMP
[ 293.924855] Modules linked in:
[ 293.924855]
[ 293.924855] Pid: 13, comm: ksoftirqd/1 Not tainted 3.5.0-rc1+ #1424 Bochs Bochs
[ 293.924855] EIP: 0060:[<c108e3e5>] EFLAGS: 00010202 CPU: 0
[ 293.924855] EIP is at smpboot_thread_check_parking+0xb5/0xc0
[ 293.924855] EAX: 00000000 EBX: df4003c0 ECX: 00000000 EDX: 00000000
[ 293.924855] ESI: 00000001 EDI: c19a1ec0 EBP: df47ff54 ESP: df47ff48
[ 293.924855] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
[ 293.924855] CR0: 8005003b CR2: b7782000 CR3: 02cfd000 CR4: 00000690
[ 293.924855] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[ 293.924855] DR6: ffff0ff0 DR7: 00000400
[ 293.924855] Process ksoftirqd/1 (pid: 13, ti=df47e000 task=df480a20 task.ti=df47e000)
[ 293.924855] Stack:
[ 293.924855] df480a20 df4003c0 df480a20 df47ff6c c1039cef 00000000 df459ed4 df4003c0
[ 293.924855] c1039cd0 df47ffe4 c105490f 00000001 00000001 df4003c0 00000000 c16e5050
[ 293.924855] dead4ead ffffffff ffffffff c335f98c c34fa620 00000000 c18bb502 df47ffa4
[ 293.924855] Call Trace:
[ 293.924855] [<c1039cef>] run_ksoftirqd+0x1f/0x110
[ 293.924855] [<c1039cd0>] ? __do_softirq+0x330/0x330
[ 293.924855] [<c105490f>] kthread+0x8f/0xa0
[ 293.924855] [<c16e5050>] ? nmi_espfix_stack+0x2a/0x6a
[ 293.924855] [<c16e0000>] ? sock_rps_save_rxhash.isra.31.part.32+0xd6/0x127
[ 293.924855] [<c1054880>] ? flush_kthread_worker+0xe0/0xe0
[ 293.924855] [<c16eb17a>] kernel_thread_helper+0x6/0xd
[ 293.924855] Code: 85 d2 74 04 8b 03 ff d2 c7 43 04 01 00 00 00 31 c0 5b 5e 5f 5d c3 83 f8 02 74 07 5b 31 c0 5e 5f 5d c3 8b 57 1c 85 d2 75 db eb dd <0f> 0b 89 f6 8d bc 27 00 00 00 00 55 89 e5 57 bf ff ff ff ff 56
[ 293.924855] EIP: [<c108e3e5>] smpboot_thread_check_parking+0xb5/0xc0 SS:ESP 0068:df47ff48
[ 293.960654] rcu-torture:rcu_torture_onoff task: onlined 1

2012-06-14 15:46:13

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 04:56:29PM +0200, Peter Zijlstra wrote:
> On Thu, 2012-06-14 at 07:47 -0700, Paul E. McKenney wrote:
> > RCU callback processing consumes the entire CPU in RCU_BOOST case where
> > processing runs at real-time priority. This is analogous to RT throttling
> > in the scheduler.
>
> But previously we ran in non-preemptible softirq context; why would
> it behave differently when done from an RT task?

In -rt, yes, but in mainline, ksoftirqd does not run at RT prio, right?

> Also, no, it's not quite like the throttling; that really idles the cpu
> even if there are no SCHED_OTHER tasks to run.

Agreed, not -exactly- like throttling, but it has a broadly similar
goal, namely to prevent a given type of processing from starving
everything else in the system.

That said, why does throttling idle the CPU even if there are no other
SCHED_OTHER tasks to run?

Thanx, Paul

2012-06-14 16:19:57

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Paul E. McKenney wrote:

> On Thu, Jun 14, 2012 at 07:50:38AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 14, 2012 at 03:32:19PM +0200, Thomas Gleixner wrote:
> > > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > > >
> > > > Really? I have a strictly Western-Hemisphere bug? ;-)
> > >
> > > I guess I need to fire up rcu torture to make it surface.
> >
> > A simple offline was triggering it for me. Perhaps some of my debug
> > code was inappropriate, will retry.
>
> And removing my debug code did the trick. I guess some of it was really
> just bug code. In any case, it now passes short rcutorture testing for
> me as well.
>
> However, I do see the splat shown below. The splat confuses me greatly,
> as I thought that this kthread should be affinitied to a single CPU and
> thus smp_processor_id() should refrain from splatting. I don't see any
> sort of "I broke affinity" message from the scheduler.

Hmm. Is that reproducible ?

2012-06-14 16:20:44

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, 14 Jun 2012, Paul E. McKenney wrote:

> On Thu, Jun 14, 2012 at 04:56:29PM +0200, Peter Zijlstra wrote:
> > On Thu, 2012-06-14 at 07:47 -0700, Paul E. McKenney wrote:
> > > RCU callback processing consumes the entire CPU in RCU_BOOST case where
> > > processing runs at real-time priority. This is analogous to RT throttling
> > > in the scheduler.
> >
> > But previously we ran in non-preemptible softirq context; why would
> > it behave differently when done from an RT task?
>
> In -rt, yes, but in mainline, ksoftirqd does not run at RT prio, right?
>
> > Also, no, it's not quite like the throttling; that really idles the cpu
> > even if there are no SCHED_OTHER tasks to run.
>
> Agreed, not -exactly- like throttling, but it has a broadly similar
> goal, namely to prevent a given type of processing from starving
> everything else in the system.
>
> That said, why does throttling idle the CPU even if there are no other
> SCHED_OTHER tasks to run?

For simplicity reasons :)

Thanks,

tglx

2012-06-14 16:58:44

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 06:19:50PM +0200, Thomas Gleixner wrote:
> On Thu, 14 Jun 2012, Paul E. McKenney wrote:
>
> > On Thu, Jun 14, 2012 at 07:50:38AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jun 14, 2012 at 03:32:19PM +0200, Thomas Gleixner wrote:
> > > > On Thu, 14 Jun 2012, Paul E. McKenney wrote:
> > > > > On Thu, Jun 14, 2012 at 01:20:39PM +0200, Thomas Gleixner wrote:
> > > > > > I gave it a quick shot, but I was not able to reproduce the hang yet.
> > > > >
> > > > > Really? I have a strictly Western-Hemisphere bug? ;-)
> > > >
> > > > I guess I need to fire up rcu torture to make it surface.
> > >
> > > A simple offline was triggering it for me. Perhaps some of my debug
> > > code was inappropriate, will retry.
> >
> > And removing my debug code did the trick. I guess some of it was really
> > just bug code. In any case, it now passes short rcutorture testing for
> > me as well.
> >
> > However, I do see the splat shown below. The splat confuses me greatly,
> > as I thought that this kthread should be affinitied to a single CPU and
> > thus smp_processor_id() should refrain from splatting. I don't see any
> > sort of "I broke affinity" message from the scheduler.
>
> Hmm. Is that reproducible ?

Twice so far, will retry.

Thanx, Paul

2012-06-14 22:40:35

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 09:51:25PM -0700, Paul E. McKenney wrote:
> On Wed, Jun 13, 2012 at 01:47:25PM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 13, 2012 at 12:17:45PM -0700, Paul E. McKenney wrote:
> > > On Wed, Jun 13, 2012 at 09:02:55PM +0200, Thomas Gleixner wrote:
> > > > On Wed, 13 Jun 2012, Paul E. McKenney wrote:
> > > > > On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > > > > > /* Now call notifier in preparation. */
> > > > > > cpu_notify(CPU_ONLINE | mod, hcpu);
> > > > > > + smpboot_unpark_threads(cpu);
> > > > >
> > > > > OK, RCU must use the lower-level interfaces, given that one of
> > > > > the CPU_ONLINE notifiers might invoke synchronize_rcu().
> > > >
> > > > We can start the threads before the notifiers. There is no
> > > > restriction.
> > >
> > > Sounds very good in both cases!
> >
> > Just for reference, here is what I am using.
>
> And here is a buggy first attempt to make RCU use the smpboot interfaces.
> Probably still bugs in my adaptation, as it still hangs in the first
> attempt to offline a CPU. If I revert the softirq smpboot commit, the
> offline still hangs somewhere near the __stop_machine() processing, but
> the system continues running otherwise. Will continue debugging tomorrow.
>
> When doing this sort of conversion, renaming the per-CPU variable used
> to hold the kthreads' task_struct pointers is highly recommended --
> failing to do so cost me substantial confusion. ;-)

OK, if it is going to actually work, I guess I can sign it off.

Thanx, Paul

------------------------------------------------------------------------

rcu: Use smp_hotplug_thread facility for RCU's per-CPU kthread

Bring RCU into the new-age CPU-hotplug fold by modifying RCU's per-CPU
kthread code to use the new smp_hotplug_thread facility.

Signed-off-by: Paul E. McKenney <[email protected]>

rcutree.c | 4 -
rcutree.h | 2
rcutree_plugin.h | 177 ++++++++-----------------------------------------------
rcutree_trace.c | 3
4 files changed, 27 insertions(+), 159 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88..7813d7d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -125,7 +125,6 @@ static int rcu_scheduler_fully_active __read_mostly;
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);

@@ -1458,7 +1457,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */

/* Adjust any no-longer-needed kthreads. */
- rcu_stop_cpu_kthread(cpu);
rcu_node_kthread_setaffinity(rnp, -1);

/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -2514,11 +2512,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_ONLINE:
case CPU_DOWN_FAILED:
rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
break;
case CPU_DOWN_PREPARE:
rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138..70883af 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -434,7 +434,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -472,7 +471,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
static void invoke_rcu_node_kthread(struct rcu_node *rnp);
static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000..f789341 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/

#include <linux/delay.h>
+#include <linux/smpboot.h>

#define RCU_KTHREAD_PRIO 1

@@ -1449,25 +1450,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}

-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1490,30 +1472,6 @@ static void invoke_rcu_node_kthread(struct rcu_node *rnp)
}

/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
* Timer handler to initiate the waking up of per-CPU kthreads that
* have yielded the CPU due to excess numbers of RCU callbacks.
* We wake up the per-rcu_node kthread, which in turn will wake up
@@ -1553,63 +1511,35 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
}

/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
-}
-
-/*
* Per-CPU kernel thread that invokes RCU callbacks. This replaces the
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
-static int rcu_cpu_kthread(void *arg)
+static int rcu_cpu_kthread(void *cookie)
{
- int cpu = (int)(long)arg;
unsigned long flags;
+ struct sched_param sp;
int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ char *workp = &__get_cpu_var(rcu_cpu_has_work);

- trace_rcu_utilization("Start CPU kthread@init");
+ trace_rcu_utilization("Start CPU kthread@unpark");
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
for (;;) {
*statusp = RCU_KTHREAD_WAITING;
trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
+ rcu_wait(*workp != 0 ||
+ smpboot_thread_check_parking(cookie));
trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
+ if (smpboot_thread_check_parking(cookie)) {
local_bh_enable();
break;
}
*statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
+ this_cpu_inc(rcu_cpu_kthread_loops);
local_irq_save(flags);
work = *workp;
*workp = 0;
@@ -1624,59 +1554,14 @@ static int rcu_cpu_kthread(void *arg)
if (spincnt > 10) {
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ rcu_yield(rcu_cpu_kthread_timer,
+ smp_processor_id());
trace_rcu_utilization("Start CPU kthread@rcu_yield");
spincnt = 0;
}
}
*statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
+ trace_rcu_utilization("End CPU kthread@park");
return 0;
}

@@ -1788,6 +1673,12 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
}

+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+};
+
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
@@ -1797,11 +1688,9 @@ static int __init rcu_spawn_kthreads(void)
struct rcu_node *rnp;

rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
rnp = rcu_get_root(rcu_state);
(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
@@ -1818,11 +1707,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;

/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ if (rcu_scheduler_fully_active &&
+ rnp->node_kthread_task == NULL)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1846,22 +1733,10 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-}
-
static int __init rcu_scheduler_really_started(void)
{
rcu_scheduler_fully_active = 1;
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16d..6b4c76b 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);

2012-06-15 01:53:39

by Tejun Heo

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Thu, Jun 14, 2012 at 10:17:28AM +0200, Peter Zijlstra wrote:
> On Thu, 2012-06-14 at 10:08 +0200, Peter Zijlstra wrote:
> > On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> > > If it's just a spurious wakeup then it goes back to sleep right away
> > > as nothing cleared the park bit.
> >
> > Your spurious wakeup will have destroyed the binding though. So you need
> > to be careful.
>
> We should probably do something like the below..
>
> TJ, does this wreck workqueues? It's somewhat 'creative' in that regard
> and really wants fixing.
>
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5018,6 +5018,8 @@ void do_set_cpus_allowed(struct task_str
>
> cpumask_copy(&p->cpus_allowed, new_mask);
> p->nr_cpus_allowed = cpumask_weight(new_mask);
> + if (p->nr_cpus_allowed != 1)
> + p->flags &= ~PF_THREAD_BOUND;

The only reason wq workers use PF_THREAD_BOUND is to prevent userland
from mucking with cpus_allowed, so the above wouldn't break anything
in itself although userland would be able to wreck it afterwards.

Thanks.

--
tejun

2012-06-15 09:58:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Fri, 2012-06-15 at 10:53 +0900, Tejun Heo wrote:
> On Thu, Jun 14, 2012 at 10:17:28AM +0200, Peter Zijlstra wrote:
> > On Thu, 2012-06-14 at 10:08 +0200, Peter Zijlstra wrote:
> > > On Wed, 2012-06-13 at 20:56 +0200, Thomas Gleixner wrote:
> > > > If it's just a spurious wakeup then it goes back to sleep right away
> > > > as nothing cleared the park bit.
> > >
> > > Your spurious wakeup will have destroyed the binding though. So you need
> > > to be careful.
> >
> > We should probably do something like the below..
> >
> > TJ, does this wreck workqueues? It's somewhat 'creative' in that regard
> > and really wants fixing.
> >
> > ---
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -5018,6 +5018,8 @@ void do_set_cpus_allowed(struct task_str
> >
> > cpumask_copy(&p->cpus_allowed, new_mask);
> > p->nr_cpus_allowed = cpumask_weight(new_mask);
> > + if (p->nr_cpus_allowed != 1)
> > + p->flags &= ~PF_THREAD_BOUND;
>
> The only reason wq workers use PF_THREAD_BOUND is to prevent userland
> from mucking with cpus_allowed, so the above wouldn't break anything
> in itself although userland would be able to wreck it afterwards.

Thing is, if things could get wrecked by userland moving a thread to a
different cpu, things just got wrecked by the kernel doing that very
same thing.

PF_THREAD_BOUND isn't called PF_NO_USER_AFFINITY (although it seems a
popular interpretation).
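
For reference, a paraphrased sketch of what PF_THREAD_BOUND actually
gates in kernels of this era (not verbatim code): the
sched_setaffinity() path rejects affinity changes on a bound task,
while in-kernel callers go through do_set_cpus_allowed() and bypass
the check entirely, which is the asymmetry under discussion.

static long set_affinity_from_user(struct task_struct *p,
				   const struct cpumask *new_mask)
{
	/* Userland may not rebind a per-cpu bound thread. */
	if (p->flags & PF_THREAD_BOUND)
		return -EINVAL;

	/* In-kernel callers skip this check, so the kernel can
	 * still migrate such a thread and destroy the binding. */
	return set_cpus_allowed_ptr(p, new_mask);
}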

2012-06-18 08:47:21

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> +
> +static int
> +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
> +{
> + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> + struct smpboot_thread_data *td;
> +
> + if (tsk)
> + return 0;
> +
> + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
> + if (!td)
> + return -ENOMEM;
> + td->cpu = cpu;
> + td->ht = ht;
> +
> + tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm);
> + if (IS_ERR(tsk))
> + return PTR_ERR(tsk);
> +

Hi Thomas, I might be missing something obvious, but won't we leak the td
allocated here if kthread_create_on_cpu() fails?

Cyrill

2012-06-18 08:50:56

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC patch 2/5] smpboot: Provide infrastructure for percpu hotplug threads

On Mon, 18 Jun 2012, Cyrill Gorcunov wrote:
> On Wed, Jun 13, 2012 at 11:00:54AM -0000, Thomas Gleixner wrote:
> > +
> > +static int
> > +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
> > +{
> > + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
> > + struct smpboot_thread_data *td;
> > +
> > + if (tsk)
> > + return 0;
> > +
> > + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
> > + if (!td)
> > + return -ENOMEM;
> > + td->cpu = cpu;
> > + td->ht = ht;
> > +
> > + tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm);
> > + if (IS_ERR(tsk))
> > + return PTR_ERR(tsk);
> > +
>
> Hi Thomas, I might be missing something obvious, but won't we leak the td
> allocated here if kthread_create_on_cpu() fails?

Yes. Good catch!

Thanks,

tglx
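
A minimal sketch of the fix implied here, freeing td on the error path
of __smpboot_create_thread() (untested, against the RFC code quoted
above; td was allocated with kzalloc_node() just before this call):

	tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm);
	if (IS_ERR(tsk)) {
		kfree(td);	/* don't leak the per-thread data */
		return PTR_ERR(tsk);
	}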