2010-04-30 18:02:54

by Steffen Klassert

Subject: [PATCH 0/8] padata: fixes/cleanups

This patchset is based on Andrew's padata review. It applies against the
cryptodev-2.6 tree. Btw, how should future padata patches be routed upstream,
still via cryptodev-2.6?

Steffen


2010-04-30 16:47:56

by Andrew Morton

Subject: Re: [PATCH 0/8] padata: fixes/cleanups

On Thu, 29 Apr 2010 14:36:36 +0200
Steffen Klassert <[email protected]> wrote:

> This patchset is based on Andrew's padata review. It applies against the
> cryptodev-2.6 tree. Btw, how should future padata patches be routed upstream,
> still via cryptodev-2.6?

That's OK by me. I'll often grab stuff (especially if I was cc'ed on
it) and will drop it again if I see a subsystem maintainer took it.
People use this to delegate the resend-to-sleepy-maintainer job to
me. Herbert is less sleepy than many ;)

2010-04-30 17:08:43

by Andrew Morton

Subject: Re: [PATCH 6/8] padata: Use a timer to handle the reorder queues

On Thu, 29 Apr 2010 14:43:37 +0200
Steffen Klassert <[email protected]> wrote:

> padata_get_next had a bogus check that always returned true,
> so the try_again loop in padata_reorder was never taken.

A better changelog would have told us what this "bogus check" _is_.

> This can lead to object leaks in some rare cases.

And a better changelog would describe those leaks!

> This patch
> implements a timer that processes the reorder queues if no one
> else does it in an appropriate time.

Under what circumstances would "no one else do it in an appropriate time"?
Would that be a bug, or what?

> @@ -273,13 +274,22 @@ try_again:
>
> spin_unlock_bh(&pd->lock);
>
> - if (atomic_read(&pd->reorder_objects))
> - goto try_again;
> + if (atomic_read(&pd->reorder_objects)
> + && !(pinst->flags & PADATA_RESET))
> + mod_timer(&pd->timer, jiffies + HZ);
> + else
> + del_timer(&pd->timer);
>
> -out:
> return;
> }

I'd feel more comfortable if the above was in the locked region. Is
there a race whereby another CPU can set pd->reorder_objects, but we
forgot to arm the timer?

2010-04-30 17:29:21

by Steffen Klassert

Subject: [PATCH 8/8] padata: Add some code comments


Signed-off-by: Steffen Klassert <[email protected]>
---
include/linux/padata.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/padata.c | 46 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 64836a6..e8aac0f 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -26,6 +26,17 @@
#include <linux/list.h>
#include <linux/timer.h>

+/*
+ * struct padata_priv - Embedded into the user's data structure.
+ *
+ * @list: List entry, to attach to the padata lists.
+ * @pd: Pointer to the internal control structure.
+ * @cb_cpu: Callback cpu for serialization.
+ * @seq_nr: Sequence number of the parallelized data object.
+ * @info: Used to pass information from the parallel to the serial function.
+ * @parallel: Parallel execution function.
+ * @serial: Serial complete function.
+ */
struct padata_priv {
struct list_head list;
struct parallel_data *pd;
@@ -36,11 +47,29 @@ struct padata_priv {
void (*serial)(struct padata_priv *padata);
};

+/*
+ * struct padata_list
+ *
+ * @list: List head.
+ * @lock: List lock.
+ */
struct padata_list {
struct list_head list;
spinlock_t lock;
};

+/*
+ * struct padata_queue - The percpu padata queues.
+ *
+ * @parallel: List to wait for parallelization.
+ * @reorder: List to wait for reordering after parallel processing.
+ * @serial: List to wait for serialization after reordering.
+ * @pwork: work struct for parallelization.
+ * @swork: work struct for serialization.
+ * @pd: Backpointer to the internal control structure.
+ * @num_obj: Number of objects that are processed by this cpu.
+ * @cpu_index: Index of the cpu.
+ */
struct padata_queue {
struct padata_list parallel;
struct padata_list reorder;
@@ -52,6 +81,20 @@ struct padata_queue {
int cpu_index;
};

+/*
+ * struct parallel_data - Internal control structure, covers everything
+ * that depends on the cpumask in use.
+ *
+ * @pinst: padata instance.
+ * @queue: percpu padata queues.
+ * @seq_nr: The sequence number that will be attached to the next object.
+ * @reorder_objects: Number of objects waiting in the reorder queues.
+ * @refcnt: Number of objects holding a reference on this parallel_data.
+ * @max_seq_nr: Maximal used sequence number.
+ * @cpumask: cpumask in use.
+ * @lock: Reorder lock.
+ * @timer: Reorder timer.
+ */
struct parallel_data {
struct padata_instance *pinst;
struct padata_queue *queue;
@@ -64,6 +107,16 @@ struct parallel_data {
struct timer_list timer;
};

+/*
+ * struct padata_instance - The overall control structure.
+ *
+ * @cpu_notifier: cpu hotplug notifier.
+ * @wq: The workqueue in use.
+ * @pd: The internal control structure.
+ * @cpumask: User supplied cpumask.
+ * @lock: padata instance lock.
+ * @flags: padata flags.
+ */
struct padata_instance {
struct notifier_block cpu_notifier;
struct workqueue_struct *wq;
diff --git a/kernel/padata.c b/kernel/padata.c
index ec6b8b7..fd09ea1 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -152,6 +152,23 @@ out:
}
EXPORT_SYMBOL(padata_do_parallel);

+/*
+ * padata_get_next - Get the next object that needs serialization.
+ *
+ * Return values are:
+ *
+ * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ *
+ * NULL, if all percpu reorder queues are empty.
+ *
+ * -EINPROGRESS, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
+ *
+ * -ENODATA, if this cpu has to do the parallel processing for
+ * the next object.
+ */
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
int cpu, num_cpus, empty, calc_seq_nr;
@@ -173,7 +190,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)

/*
* Calculate the seq_nr of the object that should be
- * next in this queue.
+ * next in this reorder queue.
*/
overrun = 0;
calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
@@ -248,15 +265,32 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_queue *queue;
struct padata_instance *pinst = pd->pinst;

+ /*
+ * The cpu that takes the trylock takes care of all objects that
+ * are enqueued to the percpu reorder queues while the lock is held.
+ * So if a cpu can't get the trylock, we exit, because the holder
+ * of the lock does the work for us if processing is needed.
+ */
if (!spin_trylock_bh(&pd->lock))
return;

while (1) {
padata = padata_get_next(pd);

+ /*
+ * All reorder queues are empty, or the next object that needs
+ * serialization is being parallel processed by another cpu and is
+ * still on its way to the cpu's reorder queue; nothing to
+ * do for now.
+ */
if (!padata || PTR_ERR(padata) == -EINPROGRESS)
break;

+ /*
+ * This cpu has to do the parallel processing of the next
+ * object. It's waiting in the cpu's parallelization queue,
+ * so exit immediately.
+ */
if (PTR_ERR(padata) == -ENODATA) {
del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
@@ -274,6 +308,11 @@ static void padata_reorder(struct parallel_data *pd)

spin_unlock_bh(&pd->lock);

+ /*
+ * The next object that needs serialization might have arrived at
+ * the reorder queues in the meantime; we will be called again
+ * from the timer function if no one else takes care of it.
+ */
if (atomic_read(&pd->reorder_objects)
&& !(pinst->flags & PADATA_RESET))
mod_timer(&pd->timer, jiffies + HZ);
@@ -348,6 +387,7 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);

+/* Allocate and initialize the internal cpumask-dependent resources. */
static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
const struct cpumask *cpumask)
{
@@ -417,6 +457,7 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}

+/* Flush all objects out of the padata queues. */
static void padata_flush_queues(struct parallel_data *pd)
{
int cpu;
@@ -440,6 +481,7 @@ static void padata_flush_queues(struct parallel_data *pd)
BUG_ON(atomic_read(&pd->refcnt) != 0);
}

+/* Replace the internal control structure with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
@@ -706,7 +748,7 @@ EXPORT_SYMBOL(padata_alloc);
/*
* padata_free - free a padata instance
*
- * @ padata_inst: padata instance to free
+ * @padata_inst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
--
1.5.6.5

2010-04-30 17:29:32

by Steffen Klassert

Subject: Re: [PATCH 7/8] padata: Flush the padata queues actively

On Thu, Apr 29, 2010 at 04:11:12PM -0700, Andrew Morton wrote:
> On Thu, 29 Apr 2010 14:44:26 +0200
> Steffen Klassert <[email protected]> wrote:
>
> > +static void padata_flush_queues(struct parallel_data *pd)
> > +{
> > + int cpu;
> > + struct padata_queue *queue;
> > +
> > + for_each_cpu(cpu, pd->cpumask) {
> > + queue = per_cpu_ptr(pd->queue, cpu);
> > + flush_work(&queue->pwork);
> > + }
> > +
> > + del_timer_sync(&pd->timer);
> > +
> > + if (atomic_read(&pd->reorder_objects))
> > + padata_reorder(pd);
>
> padata_reorder() can fail to do anything, if someone else is holding
> pd->lock. What happens then?
>

padata does not accept new objects for parallelization once padata_flush_queues
is called. The path of the data objects through the padata queues is

--> parallelization queue -> reorder queue -> serialization queue -->

So padata_flush_queues processes the objects in the parallelization queue
by doing flush_work(&queue->pwork). Then we delete the timer and wait for a
potentially running timer function to finish. We are not accepting new objects,
and the parallelization queue is empty, so the lock must be free by then.

>
> > + for_each_cpu(cpu, pd->cpumask) {
> > + queue = per_cpu_ptr(pd->queue, cpu);
> > + flush_work(&queue->swork);
> > + }
> > + BUG_ON(atomic_read(&pd->refcnt) != 0);
> > +}
>
> Are we safe against cpu hot-unplug in this code?

padata_flush_queues is called after a call to get_online_cpus in all but
one case. I just noticed that I forgot to add the
get_online_cpus/put_online_cpus in padata_free. I'll update the
get_online_cpus/put_online_cpus patch accordingly, then it should be safe in
all cases.
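
For illustration, a minimal sketch of what the missing bracket in padata_free
might look like (an assumption about the upcoming respin, based on the code
quoted in this thread, not a posted patch):

void padata_free(struct padata_instance *pinst)
{
	/* ... */
	synchronize_rcu();

#ifdef CONFIG_HOTPLUG_CPU
	unregister_hotcpu_notifier(&pinst->cpu_notifier);
#endif
	/* Keep the cpumask stable while the queues are flushed. */
	get_online_cpus();
	padata_flush_queues(pinst->pd);
	put_online_cpus();

	padata_free_pd(pinst->pd);
	free_cpumask_var(pinst->cpumask);
	kfree(pinst);
}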

2010-04-30 18:04:50

by Steffen Klassert

Subject: [PATCH 4/8] padata: Initialize the padata queues only for the used cpus

padata_alloc_pd set up queues for all possible cpus.
This patch changes this to set up the queues only for
the cpus actually in use.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 14 +++++---------
1 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 5fa6ba6..fc9f19a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -358,17 +358,15 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
goto err_free_queue;

- for_each_possible_cpu(cpu) {
+ cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+
+ for_each_cpu(cpu, pd->cpumask) {
queue = per_cpu_ptr(pd->queue, cpu);

queue->pd = pd;

- if (cpumask_test_cpu(cpu, cpumask)
- && cpumask_test_cpu(cpu, cpu_active_mask)) {
- queue->cpu_index = cpu_index;
- cpu_index++;
- } else
- queue->cpu_index = -1;
+ queue->cpu_index = cpu_index;
+ cpu_index++;

INIT_LIST_HEAD(&queue->reorder.list);
INIT_LIST_HEAD(&queue->parallel.list);
@@ -382,8 +380,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
atomic_set(&queue->num_obj, 0);
}

- cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
-
num_cpus = cpumask_weight(pd->cpumask);
pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;

--
1.5.6.5

2010-04-30 18:04:59

by Steffen Klassert

Subject: [PATCH 2/8] padata: cpu hotplug code should depend on CONFIG_HOTPLUG_CPU

This patch makes the padata cpu hotplug code dependent on CONFIG_HOTPLUG_CPU.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 5b44d0f..1209a17 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -570,6 +570,7 @@ void padata_stop(struct padata_instance *pinst)
}
EXPORT_SYMBOL(padata_stop);

+#ifdef CONFIG_HOTPLUG_CPU
static int padata_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
@@ -621,6 +622,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,

return NOTIFY_OK;
}
+#endif

/*
* padata_alloc - allocate and initialize a padata instance
@@ -631,7 +633,6 @@ static int padata_cpu_callback(struct notifier_block *nfb,
struct padata_instance *padata_alloc(const struct cpumask *cpumask,
struct workqueue_struct *wq)
{
- int err;
struct padata_instance *pinst;
struct parallel_data *pd;

@@ -654,18 +655,16 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,

pinst->flags = 0;

+#ifdef CONFIG_HOTPLUG_CPU
pinst->cpu_notifier.notifier_call = padata_cpu_callback;
pinst->cpu_notifier.priority = 0;
- err = register_hotcpu_notifier(&pinst->cpu_notifier);
- if (err)
- goto err_free_cpumask;
+ register_hotcpu_notifier(&pinst->cpu_notifier);
+#endif

mutex_init(&pinst->lock);

return pinst;

-err_free_cpumask:
- free_cpumask_var(pinst->cpumask);
err_free_pd:
padata_free_pd(pd);
err_free_inst:
@@ -689,7 +688,9 @@ void padata_free(struct padata_instance *pinst)
while (atomic_read(&pinst->pd->refcnt) != 0)
yield();

+#ifdef CONFIG_HOTPLUG_CPU
unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
padata_free_pd(pinst->pd);
free_cpumask_var(pinst->cpumask);
kfree(pinst);
--
1.5.6.5

2010-04-30 18:05:09

by Steffen Klassert

Subject: [PATCH 1/8] padata: Dont scale the parallel objects with the cpus

Scaling the maximum number of objects in the parallel
codepath with NR_CPUS can lead to out-of-memory problems on bigsmp
machines. With NR_CPUS = 4096, for example, the old limit of
10000 * NR_CPUS allows about 41 million in-flight objects.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 70b5d5e..5b44d0f 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,7 @@
#include <linux/rcupdate.h>

#define MAX_SEQ_NR INT_MAX - NR_CPUS
-#define MAX_OBJ_NUM 10000 * NR_CPUS
+#define MAX_OBJ_NUM 1000

static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
--
1.5.6.5

2010-04-30 18:06:09

by Steffen Klassert

Subject: [PATCH 3/8] padata: Remove superfluous might_sleep

might_sleep() was placed before mutex_lock() in some places.
We remove these calls because mutex_lock() already does a
might_sleep() check itself.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 10 ----------
1 files changed, 0 insertions(+), 10 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 1209a17..5fa6ba6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -443,8 +443,6 @@ int padata_set_cpumask(struct padata_instance *pinst,
struct parallel_data *pd;
int err = 0;

- might_sleep();
-
mutex_lock(&pinst->lock);

pd = padata_alloc_pd(pinst, cpumask);
@@ -489,8 +487,6 @@ int padata_add_cpu(struct padata_instance *pinst, int cpu)
{
int err;

- might_sleep();
-
mutex_lock(&pinst->lock);

cpumask_set_cpu(cpu, pinst->cpumask);
@@ -527,8 +523,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
int err;

- might_sleep();
-
mutex_lock(&pinst->lock);

cpumask_clear_cpu(cpu, pinst->cpumask);
@@ -547,8 +541,6 @@ EXPORT_SYMBOL(padata_remove_cpu);
*/
void padata_start(struct padata_instance *pinst)
{
- might_sleep();
-
mutex_lock(&pinst->lock);
pinst->flags |= PADATA_INIT;
mutex_unlock(&pinst->lock);
@@ -562,8 +554,6 @@ EXPORT_SYMBOL(padata_start);
*/
void padata_stop(struct padata_instance *pinst)
{
- might_sleep();
-
mutex_lock(&pinst->lock);
pinst->flags &= ~PADATA_INIT;
mutex_unlock(&pinst->lock);
--
1.5.6.5

2010-04-30 18:18:06

by Steffen Klassert

Subject: [PATCH 6/8] padata: Use a timer to handle the reorder queues

padata_get_next had a bogus check that always returned true,
so the try_again loop in padata_reorder was never taken.
This can lead to object leaks in some rare cases. This patch
implements a timer that processes the reorder queues if no one
else does it in an appropriate time.

Signed-off-by: Steffen Klassert <[email protected]>
---
include/linux/padata.h | 2 ++
kernel/padata.c | 25 ++++++++++++++++++-------
2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 51611da..64836a6 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -24,6 +24,7 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/timer.h>

struct padata_priv {
struct list_head list;
@@ -60,6 +61,7 @@ struct parallel_data {
unsigned int max_seq_nr;
cpumask_var_t cpumask;
spinlock_t lock;
+ struct timer_list timer;
};

struct padata_instance {
diff --git a/kernel/padata.c b/kernel/padata.c
index 82958e0..6d7ea48 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -231,7 +231,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
goto out;
}

- if (next_nr % num_cpus == next_queue->cpu_index) {
+ queue = per_cpu_ptr(pd->queue, smp_processor_id());
+ if (queue->cpu_index == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
goto out;
}
@@ -247,9 +248,8 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_queue *queue;
struct padata_instance *pinst = pd->pinst;

-try_again:
if (!spin_trylock_bh(&pd->lock))
- goto out;
+ return;

while (1) {
padata = padata_get_next(pd);
@@ -258,8 +258,9 @@ try_again:
break;

if (PTR_ERR(padata) == -ENODATA) {
+ del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
- goto out;
+ return;
}

queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
@@ -273,13 +274,22 @@ try_again:

spin_unlock_bh(&pd->lock);

- if (atomic_read(&pd->reorder_objects))
- goto try_again;
+ if (atomic_read(&pd->reorder_objects)
+ && !(pinst->flags & PADATA_RESET))
+ mod_timer(&pd->timer, jiffies + HZ);
+ else
+ del_timer(&pd->timer);

-out:
return;
}

+static void padata_reorder_timer(unsigned long arg)
+{
+ struct parallel_data *pd = (struct parallel_data *)arg;
+
+ padata_reorder(pd);
+}
+
static void padata_serial_worker(struct work_struct *work)
{
struct padata_queue *queue;
@@ -383,6 +393,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
num_cpus = cpumask_weight(pd->cpumask);
pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;

+ setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
--
1.5.6.5

2010-04-30 18:18:36

by Steffen Klassert

Subject: [PATCH 5/8] padata: Use get_online_cpus/put_online_cpus

This patch puts get_online_cpus/put_online_cpus around the places where we
modify the padata cpumask, to ensure that no cpu goes offline during the
operation.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index fc9f19a..82958e0 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -441,6 +441,8 @@ int padata_set_cpumask(struct padata_instance *pinst,

mutex_lock(&pinst->lock);

+ get_online_cpus();
+
pd = padata_alloc_pd(pinst, cpumask);
if (!pd) {
err = -ENOMEM;
@@ -452,6 +454,8 @@ int padata_set_cpumask(struct padata_instance *pinst,
padata_replace(pinst, pd);

out:
+ put_online_cpus();
+
mutex_unlock(&pinst->lock);

return err;
@@ -485,8 +489,10 @@ int padata_add_cpu(struct padata_instance *pinst, int cpu)

mutex_lock(&pinst->lock);

+ get_online_cpus();
cpumask_set_cpu(cpu, pinst->cpumask);
err = __padata_add_cpu(pinst, cpu);
+ put_online_cpus();

mutex_unlock(&pinst->lock);

@@ -521,8 +527,10 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)

mutex_lock(&pinst->lock);

+ get_online_cpus();
cpumask_clear_cpu(cpu, pinst->cpumask);
err = __padata_remove_cpu(pinst, cpu);
+ put_online_cpus();

mutex_unlock(&pinst->lock);

@@ -626,6 +634,8 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
if (!pinst)
goto err;

+ get_online_cpus();
+
pd = padata_alloc_pd(pinst, cpumask);
if (!pd)
goto err_free_inst;
@@ -647,6 +657,8 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
register_hotcpu_notifier(&pinst->cpu_notifier);
#endif

+ put_online_cpus();
+
mutex_init(&pinst->lock);

return pinst;
@@ -655,6 +667,7 @@ err_free_pd:
padata_free_pd(pd);
err_free_inst:
kfree(pinst);
+ put_online_cpus();
err:
return NULL;
}
--
1.5.6.5

2010-04-30 18:18:48

by Steffen Klassert

Subject: Re: [PATCH 6/8] padata: Use a timer to handle the reorder queues

On Thu, Apr 29, 2010 at 04:06:44PM -0700, Andrew Morton wrote:
> On Thu, 29 Apr 2010 14:43:37 +0200
> Steffen Klassert <[email protected]> wrote:
>
> > padata_get_next had a bogus check that always returned true,
> > so the try_again loop in padata_reorder was never taken.
>
> A better changelog would have told us what this "bogus check" _is_.
>
> > This can lead to object leaks in some rare cases.
>
> And a better changelog would describe those leaks!

I'll try to write a better one and resend.

>
> > This patch
> > implements a timer that processes the reorder queues if no one
> > else does it in an appropriate time.
>
> Under what circumstances would "no one else do it in an appropriate time"?
> Would that be a bug, or what?
>

We need to ensure that only one cpu can work on dequeueing the reorder
queue at a time. Calculating in which percpu reorder queue the next object
will arrive takes some time. A spinlock would be highly contended. Also,
it is not clear in which order the objects arrive at the reorder queues.
So a cpu could wait to get the lock just to notice that there is nothing to
do at the moment. Therefore we use a trylock and let the holder of the
lock take care of all the objects enqueued while the lock is held.

The timer handles a race that comes with the trylock. If cpu1 queues
an object to the reorder queue while cpu2 holds pd->lock but has already
left the while loop in padata_reorder, cpu2 can't take care of this object,
and cpu1 exits because it can't get the lock. Usually the next cpu that
takes the lock takes care of this object too. We need the timer only if
this object was the last one to arrive at the reorder queues; in that
case, the timer function sends it out.
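
To make the pattern concrete, here is a minimal sketch (simplified, with
hypothetical helpers dequeue_next() and serialize(), and without the
-EINPROGRESS/-ENODATA cases; not the actual padata code):

static void reorder_sketch(struct parallel_data *pd)
{
	struct padata_priv *padata;

	/* Losers exit; the current lock holder handles their objects too. */
	if (!spin_trylock_bh(&pd->lock))
		return;

	/* Drain all objects that arrived while the lock is held. */
	while ((padata = dequeue_next(pd)) != NULL)
		serialize(padata);

	spin_unlock_bh(&pd->lock);

	/*
	 * An object may have been enqueued after we left the loop but
	 * before the unlock; arm the timer so it is sent out even if
	 * no other cpu takes the lock again.
	 */
	if (atomic_read(&pd->reorder_objects))
		mod_timer(&pd->timer, jiffies + HZ);
	else
		del_timer(&pd->timer);
}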


> > @@ -273,13 +274,22 @@ try_again:
> >
> > spin_unlock_bh(&pd->lock);
> >
> > - if (atomic_read(&pd->reorder_objects))
> > - goto try_again;
> > + if (atomic_read(&pd->reorder_objects)
> > + && !(pinst->flags & PADATA_RESET))
> > + mod_timer(&pd->timer, jiffies + HZ);
> > + else
> > + del_timer(&pd->timer);
> >
> > -out:
> > return;
> > }
>
> I'd feel more comfortable if the above was in the locked region. Is
> there a race whereby another CPU can set pd->reorder_objects, but we
> forgot to arm the timer?
>


We could hit the very race that the timer handles if we moved this into the
locked region:

cpu1                                    cpu2

spin_trylock_bh()
|
|
|
test pd->reorder_objects == 0
delete timer
|
|                                       hardinterrupt
|                                       set pd->reorder_objects == 1
|                                       enqueue object
|                                       spin_trylock_bh() busy
|                                       exit
|
spin_unlock_bh()

2010-04-30 18:19:02

by Steffen Klassert

Subject: [PATCH 7/8] padata: Flush the padata queues actively

yield() was used to wait until all references to the internal control
structure in use were dropped before it was freed. This patch implements
padata_flush_queues, which actively flushes the padata percpu queues
instead.

Signed-off-by: Steffen Klassert <[email protected]>
---
kernel/padata.c | 33 +++++++++++++++++++++++++--------
1 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 6d7ea48..ec6b8b7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -417,6 +417,29 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}

+static void padata_flush_queues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_queue *queue;
+
+ for_each_cpu(cpu, pd->cpumask) {
+ queue = per_cpu_ptr(pd->queue, cpu);
+ flush_work(&queue->pwork);
+ }
+
+ del_timer_sync(&pd->timer);
+
+ if (atomic_read(&pd->reorder_objects))
+ padata_reorder(pd);
+
+ for_each_cpu(cpu, pd->cpumask) {
+ queue = per_cpu_ptr(pd->queue, cpu);
+ flush_work(&queue->swork);
+ }
+
+ BUG_ON(atomic_read(&pd->refcnt) != 0);
+}
+
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
@@ -428,11 +451,7 @@ static void padata_replace(struct padata_instance *pinst,

synchronize_rcu();

- while (atomic_read(&pd_old->refcnt) != 0)
- yield();
-
- flush_workqueue(pinst->wq);
-
+ padata_flush_queues(pd_old);
padata_free_pd(pd_old);

pinst->flags &= ~PADATA_RESET;
@@ -695,12 +714,10 @@ void padata_free(struct padata_instance *pinst)

synchronize_rcu();

- while (atomic_read(&pinst->pd->refcnt) != 0)
- yield();
-
#ifdef CONFIG_HOTPLUG_CPU
unregister_hotcpu_notifier(&pinst->cpu_notifier);
#endif
+ padata_flush_queues(pinst->pd);
padata_free_pd(pinst->pd);
free_cpumask_var(pinst->cpumask);
kfree(pinst);
--
1.5.6.5

2010-04-30 18:54:44

by Andrew Morton

Subject: Re: [PATCH 7/8] padata: Flush the padata queues actively

On Thu, 29 Apr 2010 14:44:26 +0200
Steffen Klassert <[email protected]> wrote:

> +static void padata_flush_queues(struct parallel_data *pd)
> +{
> + int cpu;
> + struct padata_queue *queue;
> +
> + for_each_cpu(cpu, pd->cpumask) {
> + queue = per_cpu_ptr(pd->queue, cpu);
> + flush_work(&queue->pwork);
> + }
> +
> + del_timer_sync(&pd->timer);
> +
> + if (atomic_read(&pd->reorder_objects))
> + padata_reorder(pd);

padata_reorder() can fail to do anything, if someone else is holding
pd->lock. What happens then?


> + for_each_cpu(cpu, pd->cpumask) {
> + queue = per_cpu_ptr(pd->queue, cpu);
> + flush_work(&queue->swork);
> + }
> + BUG_ON(atomic_read(&pd->refcnt) != 0);
> +}

Are we safe against cpu hot-unplug in this code?