Here is the RCU patch for low scheduling latency Andrew was talking
about in the other thread. I had done some measurements with
amlat on a 2.4 GHz P4 Xeon box with 256MB memory running dbench,
and it reduced worst-case scheduling latencies from 800 microseconds
to about 400 microseconds.
It uses per-cpu kernel threads to execute excess callbacks and
pretty much relies on preemption. I added a CONFIG_LOW_LATENCY
option to make this conditional. The number of callbacks to
invoke in softirq before punting to krcud can be set at boot
time using the rcupdate.bhlimit parameter. The whole thing is meant
for experimentation only. The downside of doing RCU this way
is that the RCU kernel thread may further delay the grace period,
and thus there can be OOM situations.
I would be interested in any issues with this patch, including
latencies and OOM situations.
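For example, with CONFIG_LOW_LATENCY=y the limit (default 256) could be
lowered on the kernel command line like this (the value here is just an
illustration):

	rcupdate.bhlimit=128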
Dipankar
Reduce bh processing time of rcu callbacks by using tunable per-cpu
krcud daemons.
include/linux/rcupdate.h | 4 ++
include/linux/sched.h | 1
init/Kconfig | 9 ++++
kernel/rcupdate.c | 91 +++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched.c | 6 +++
5 files changed, 108 insertions(+), 3 deletions(-)
diff -puN include/linux/rcupdate.h~rcu-low-lat include/linux/rcupdate.h
--- linux-2.6.4-rcu/include/linux/rcupdate.h~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/rcupdate.h 2004-03-23 15:20:11.000000000 +0530
@@ -93,9 +93,11 @@ struct rcu_data {
long qsctr; /* User-mode/idle loop etc. */
long last_qsctr; /* value of qsctr at beginning */
/* of rcu grace period */
+ struct task_struct *krcud;
long batch; /* Batch # for current RCU batch */
struct list_head nxtlist;
struct list_head curlist;
+ struct list_head rcudlist;
};
DECLARE_PER_CPU(struct rcu_data, rcu_data);
@@ -103,9 +105,11 @@ extern struct rcu_ctrlblk rcu_ctrlblk;
#define RCU_qsctr(cpu) (per_cpu(rcu_data, (cpu)).qsctr)
#define RCU_last_qsctr(cpu) (per_cpu(rcu_data, (cpu)).last_qsctr)
+#define RCU_krcud(cpu) (per_cpu(rcu_data, (cpu)).krcud)
#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch)
#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist)
#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist)
+#define RCU_rcudlist(cpu) (per_cpu(rcu_data, (cpu)).rcudlist)
#define RCU_QSCTR_INVALID 0
diff -puN include/linux/sched.h~rcu-low-lat include/linux/sched.h
--- linux-2.6.4-rcu/include/linux/sched.h~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/sched.h 2004-03-23 15:20:12.000000000 +0530
@@ -552,6 +552,7 @@ extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
extern int task_curr(task_t *p);
extern int idle_cpu(int cpu);
+extern int rq_has_rt_task(int cpu);
void yield(void);
diff -puN init/Kconfig~rcu-low-lat init/Kconfig
--- linux-2.6.4-rcu/init/Kconfig~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/init/Kconfig 2004-03-23 15:20:12.000000000 +0530
@@ -156,6 +156,14 @@ config HOTPLUG
agent" (/sbin/hotplug) to load modules and set up software needed
to use devices as you hotplug them.
+config LOW_LATENCY
+ bool "Enable kernel features for low scheduling latency" if EXPERIMENTAL
+ default n
+ ---help---
+ This option enables various features in the kernel that
+ help reduce scheduling latency while potentially sacrificing
+ throughput.
+
config IKCONFIG
bool "Kernel .config support"
---help---
@@ -181,7 +189,6 @@ config IKCONFIG_PROC
This option enables access to kernel configuration file and build
information through /proc/config.gz.
-
menuconfig EMBEDDED
bool "Remove kernel features (for embedded systems)"
help
diff -puN kernel/rcupdate.c~rcu-low-lat kernel/rcupdate.c
--- linux-2.6.4-rcu/kernel/rcupdate.c~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/rcupdate.c 2004-03-23 15:21:12.000000000 +0530
@@ -39,6 +39,7 @@
#include <asm/atomic.h>
#include <asm/bitops.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
@@ -54,6 +55,11 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
#define RCU_tasklet(cpu) (per_cpu(rcu_tasklet, cpu))
+#ifdef CONFIG_LOW_LATENCY
+static int bhlimit = 256;
+#else
+static int bhlimit = 0;
+#endif
/**
* call_rcu - Queue an RCU update request.
@@ -79,6 +85,13 @@ void fastcall call_rcu(struct rcu_head *
local_irq_restore(flags);
}
+static inline unsigned int rcu_bh_callback_limit(int cpu)
+{
+ if (in_softirq() && RCU_krcud(cpu))
+ return bhlimit;
+ return (unsigned int)-1;
+}
+
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
@@ -87,13 +100,22 @@ static void rcu_do_batch(struct list_hea
{
struct list_head *entry;
struct rcu_head *head;
+ unsigned int count = 0;
+ int cpu = smp_processor_id();
+ unsigned int limit = rcu_bh_callback_limit(cpu);
while (!list_empty(list)) {
entry = list->next;
list_del(entry);
head = list_entry(entry, struct rcu_head, list);
head->func(head->arg);
+ if (++count > limit && rq_has_rt_task(cpu)) {
+ list_splice(list, &RCU_rcudlist(cpu));
+ wake_up_process(RCU_krcud(cpu));
+ break;
+ }
}
+
}
/*
@@ -198,12 +220,67 @@ void rcu_check_callbacks(int cpu, int us
tasklet_schedule(&RCU_tasklet(cpu));
}
+static int krcud(void * __bind_cpu)
+{
+ int cpu = (int) (long) __bind_cpu;
+
+ daemonize("krcud/%d", cpu);
+ set_user_nice(current, -19);
+ current->flags |= PF_IOTHREAD;
+
+ /* Migrate to the right CPU */
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ BUG_ON(smp_processor_id() != cpu);
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ mb();
+
+ RCU_krcud(cpu) = current;
+
+ for (;;) {
+ LIST_HEAD(list);
+
+ if (list_empty(&RCU_rcudlist(cpu)))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+
+ local_bh_disable();
+ while (!list_empty(&RCU_rcudlist(cpu))) {
+ list_splice(&RCU_rcudlist(cpu), &list);
+ INIT_LIST_HEAD(&RCU_rcudlist(cpu));
+ local_bh_enable();
+ rcu_do_batch(&list);
+ cond_resched();
+ local_bh_disable();
+ }
+ local_bh_enable();
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+}
+
+static int start_krcud(int cpu)
+{
+ if (bhlimit) {
+ if (kernel_thread(krcud, (void *)(long)cpu, CLONE_KERNEL) < 0) {
+ printk("krcud for %i failed\n", cpu);
+ return -1;
+ }
+
+ while (!RCU_krcud(cpu))
+ yield();
+ }
+ return 0;
+}
+
static void __devinit rcu_online_cpu(int cpu)
{
memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data));
tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL);
INIT_LIST_HEAD(&RCU_nxtlist(cpu));
INIT_LIST_HEAD(&RCU_curlist(cpu));
+ INIT_LIST_HEAD(&RCU_rcudlist(cpu));
}
static int __devinit rcu_cpu_notify(struct notifier_block *self,
@@ -214,6 +291,10 @@ static int __devinit rcu_cpu_notify(stru
case CPU_UP_PREPARE:
rcu_online_cpu(cpu);
break;
+ case CPU_ONLINE:
+ if (start_krcud(cpu) != 0)
+ return NOTIFY_BAD;
+ break;
/* Space reserved for CPU_OFFLINE :) */
default:
break;
@@ -233,12 +314,17 @@ static struct notifier_block __devinitda
*/
void __init rcu_init(void)
{
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
+ rcu_online_cpu(smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
+static int __init rcu_late_init(void)
+{
+ return start_krcud(smp_processor_id());
+}
+
+__initcall(rcu_late_init);
/* Because of FASTCALL declaration of complete, we use this wrapper */
static void wakeme_after_rcu(void *completion)
@@ -262,6 +348,7 @@ void synchronize_kernel(void)
wait_for_completion(&completion);
}
+module_param(bhlimit, int, 0);
EXPORT_SYMBOL(call_rcu);
EXPORT_SYMBOL(synchronize_kernel);
diff -puN kernel/sched.c~rcu-low-lat kernel/sched.c
--- linux-2.6.4-rcu/kernel/sched.c~rcu-low-lat 2004-03-23 15:20:11.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/sched.c 2004-03-23 15:20:12.000000000 +0530
@@ -341,6 +341,12 @@ static inline void enqueue_task(struct t
p->array = array;
}
+int rq_has_rt_task(int cpu)
+{
+ runqueue_t *rq = cpu_rq(cpu);
+ return (sched_find_first_bit(rq->active->bitmap) < MAX_RT_PRIO);
+}
+
/*
* effective_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
_
Dipankar Sarma <[email protected]> wrote:
>
> Here is the RCU patch for low scheduling latency Andrew was talking
> about in the other thread. I had done some measurements with
> amlat on a 2.4 GHz P4 Xeon box with 256MB memory running dbench,
> and it reduced worst-case scheduling latencies from 800 microseconds
> to about 400 microseconds.
>
> It uses per-cpu kernel threads to execute excess callbacks and
> pretty much relies on preemption.
It is simple enough. Do you expect this will help with the route cache
reaping problem? I do think it's a bit hard to justify purely on the basis
of the scheduling latency goodness.
> + list_splice(&RCU_rcudlist(cpu), &list);
> + INIT_LIST_HEAD(&RCU_rcudlist(cpu));
We have list_splice_init().
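IOW, the two quoted lines collapse to:

	list_splice_init(&RCU_rcudlist(cpu), &list);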
> Reduce bh processing time of rcu callbacks by using tunable per-cpu
> krcud daemons.
why not just use work queues ?
On Tue, Mar 23, 2004 at 02:25:40AM -0800, Andrew Morton wrote:
> Dipankar Sarma <[email protected]> wrote:
> >
> > Here is the RCU patch for low scheduling latency Andrew was talking
> > about in the other thread. I had done some measurements with
> > amlat on a 2.4 GHz P4 Xeon box with 256MB memory running dbench,
> > and it reduced worst-case scheduling latencies from 800 microseconds
> > to about 400 microseconds.
> >
> > It uses per-cpu kernel threads to execute excess callbacks and
> > pretty much relies on preemption.
>
> It is simple enough. Do you expect this will help with the route cache
> reaping problem? I do think it's a bit hard to justify purely on the basis
> of the scheduling latency goodness.
We have two somewhat overlapping problems on our hands:
scheduling latencies hurt by long-running RCU softirqs, and RCU itself
impeded by long-running softirqs.
In the route cache DoS case, I have been experimenting with
various throttling mechanisms and I consistently see a grace period of
around 350ms irrespective of whether I have long-running RCU
batches in softirq or not. I checked that by batching RCUs and
putting an interval of a few ticks between batches.
This leads me to believe that the only way to avoid the route
cache DoS is to reduce the softirq load in a given period of time and have
more balance in the system. I am working on some experimental code to
throttle softirqs and get fairer use of the CPU between softirqs
and process-context code. To answer your question, I don't think
handing over to krcud will help, but it is definitely on my list
of things to experiment with under DoS.
Anyway, I will mail out the results/experiments so far to lkml
and netdev.
Thanks
Dipankar
On Tue, Mar 23, 2004 at 11:35:06AM +0100, Arjan van de Ven wrote:
>
> > Reduce bh processing time of rcu callbacks by using tunable per-cpu
> > krcud daemons.
>
> why not just use work queues ?
No particular reason other than that I wanted to experiment with
the priority of the kernel threads when we test it under heavy load.
If all this is deemed necessary, I will clean it up.
There is one minor irritant - I need to check if the worker thread
for my cpu is running or not. I will have to add something to do
that, since this whole thing is conditional and RCU is also initialized
before workqueues.
Thanks
Dipankar
On Tue, Mar 23, 2004 at 03:47:55PM +0530, Dipankar Sarma wrote:
> Here is the RCU patch for low scheduling latency Andrew was talking
> about in the other thread. I had done some measurements with
I don't see why you're using an additional kernel thread. I told you one
way to implement it via softirq taking advantage of the scheduler-friendly
re-arming tasklets.
On Tue, Mar 23, 2004 at 11:35:06AM +0100, Arjan van de Ven wrote:
>
> > Reduce bh processing time of rcu callbacks by using tunable per-cpu
> > krcud daemons.
>
> why not just use work queues ?
I don't know if work queues are scheduler friendly, but definitely the
rearming tasklets are. Running a dozen callbacks per pass and queueing
any remaining work to a re-arming tasklet should fix it.
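A minimal sketch of what I mean (not code from Dipankar's patch;
RCU_donelist() stands for a hypothetical per-cpu list of callbacks whose
grace period has completed):

static void rcu_rearm_tasklet(unsigned long unused)
{
	int cpu = smp_processor_id();
	int count = 0;

	while (!list_empty(&RCU_donelist(cpu))) {
		struct rcu_head *head = list_entry(RCU_donelist(cpu).next,
						   struct rcu_head, list);

		list_del(&head->list);
		head->func(head->arg);
		/* a dozen callbacks per pass, then re-arm and bail out;
		 * the rest runs on the next softirq pass or in ksoftirqd */
		if (++count >= 12) {
			tasklet_schedule(&RCU_tasklet(cpu));
			break;
		}
	}
}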
On Tue, Mar 23, 2004 at 01:29:25PM +0100, Andrea Arcangeli wrote:
> On Tue, Mar 23, 2004 at 03:47:55PM +0530, Dipankar Sarma wrote:
> > Here is the RCU patch for low scheduling latency Andrew was talking
> > about in the other thread. I had done some measurements with
>
> I don't see why you're using an additional kernel thread. I told you one
> way to implement it via softirq taking advantage of the scheduler-friendly
> re-arming tasklets.
I have a patch for that too, which I have been testing for DoS in
the route cache, not latency. It is worth testing here; however,
I think re-arming tasklets are not as friendly to latency as
executing the rcu callbacks from process context. One thing
I have noticed is that more softirqs worsen latency irrespective
of the worst-case softirq length.
Thanks
Dipankar
On Tue, Mar 23, 2004 at 01:31:05PM +0100, Andrea Arcangeli wrote:
> On Tue, Mar 23, 2004 at 11:35:06AM +0100, Arjan van de Ven wrote:
> >
> > > Reduce bh processing time of rcu callbacks by using tunable per-cpu
> > > krcud daemons.
> >
> > why not just use work queues ?
>
> I don't know if work queues are scheduler friendly, but definitely the
> rearming tasklets are. Running a dozen callbacks per pass and queueing
> any remaining work to a re-arming tasklet should fix it.
One problem likely to happen here is that under heavy interrupt
load, a large number of softirqs still starve out user processes.
In my DoS testing setup, I see that limiting RCU softirqs
and re-arming tasklets has no effect on user process starvation.
Thanks
Dipankar
On Tue, Mar 23, 2004 at 01:31:05PM +0100, Andrea Arcangeli wrote:
> On Tue, Mar 23, 2004 at 11:35:06AM +0100, Arjan van de Ven wrote:
> >
> > > Reduce bh processing time of rcu callbacks by using tunable per-cpu
> > > krcud daemons.
> >
> > why not just use work queues ?
>
> I don't know if work queues are scheduler friendly, but definitely the
> rearming tasklets are. Running a dozen callbacks per pass and queueing
> > any remaining work to a re-arming tasklet should fix it.
yeah ksoftirqd will work too indeed.
On Tue, Mar 23, 2004 at 06:04:26PM +0530, Dipankar Sarma wrote:
> I have a patch for that too which I have been testing for DoS in
> route cache, not latency. It is worth testing it here, however
> I think re-arming tasklets is not as friendly to latency as
> executing the rcu callbacks from process context. One thing
> I have noticed is that more softirqs worsen latency irrespective
> of the worst-case softirq length.
if we keep getting new interrupts in a flood, we'll also execute those
callbacks in a flood; that's why the number of callbacks executed must
be very small for every tasklet. The cost of the tasklet should be small
compared to the whole cost of the irq talking to the hardware too, so I
don't see much of a problem with softirqs. If you don't get a flood of
hw-irqs, the callback load will be offloaded to ksoftirqd etc... Softirqs
are also guaranteed to make progress despite not running RT. That's the
best for you. eventd must run RT. And if you don't make your
krcud RT you can run a box out of memory with an RT application, while if
you make it RT you'll again generate the latencies, making krcud useless.
this is, btw, why I implemented rcu_poll using softirqs. The problem is
that softirqs are so low latency that we couldn't coalesce many callbacks
together to maximize icache benefit, and so we have to reach fewer
quiescent points per second to do the same work etc... so we delay it to
the next irq, and that's fine; but if there's too much work to do, going
back to the softirq model as in rcu_poll sounds like the natural way to go.
On Tue, Mar 23, 2004 at 06:10:02PM +0530, Dipankar Sarma wrote:
> On Tue, Mar 23, 2004 at 01:31:05PM +0100, Andrea Arcangeli wrote:
> > On Tue, Mar 23, 2004 at 11:35:06AM +0100, Arjan van de Ven wrote:
> > >
> > > > Reduce bh processing time of rcu callbacks by using tunable per-cpu
> > > > krcud daemons.
> > >
> > > why not just use work queues ?
> >
> > I don't know if work queues are scheduler friendly, but definitely the
> > rearming tasklets are. Running a dozen callbacks per pass and queueing
> > > any remaining work to a re-arming tasklet should fix it.
>
> One problem likely to happen here is that under heavy interrupt
> load, a large number of softirqs still starve out user processes.
Disagree - run 1 callback per tasklet and then you will not be able to
measure the cost of this callback compared to the cost of talking to the
hardware, entering/exiting the kernel, etc...
> In my DoS testing setup, I see that limiting RCU softirqs
> and re-arming tasklets has no effect on user process starvation.
in an irq flood load that stalls userspace anyways it's ok to spread the
callback load into the irqs, 10 tasklets and in turn 10 callbacks per
irq or so. That load isn't scheduler friendly anyways.
the one property you need is not to be RT (like eventd is), but to be
scheduler friendly yet guaranteed to make progress too, and that's what
softirqs can give you; that's why I used only softirqs in my rcu_poll
patches too ;).
> > One problem likely to happen here is that under heavy interrupt
> > load, a large number of softirqs still starve out user processes.
>
> Disagree - run 1 callback per tasklet and then you will not be able to
> measure the cost of this callback compared to the cost of talking to the
> hardware, entering/exiting the kernel, etc...
The difficult situation is when the workload generates -lots- of
RCU callbacks, such as the tiny-files workload that Andrew pointed
Dipankar at. In this case, if we rely only on softirq, we are between
a rock and a hard place. The rock is that if we run too many
softirq handlers processing all the RCU callbacks, we will degrade
realtime response. The hard place is that if we delay softirq
processing in order to avoid degrading realtime response, we
risk having RCU callbacks piling up, exhausting memory.
This situation is what motivated using a per-CPU kernel daemon to
handle the "overflow" callbacks that could not handled by softirq
without degrading realtime response. Since the kernel daemon is
preemptible, it can run continuously without degrading realtime
response -- it will be preempted whenever a realtime task needs
to run. Therefore, the kernel-daemon approach permits safely
processing RCU callbacks up to the full capacity of the CPU.
My guess is that the small-file creation/deletion workload can
generate on the order of 100K RCU callbacks per second, and perhaps
as many as 1M RCU callbacks per second on a fast CPU. The kernel
daemon approach should be able to handle this load gracefully.
Of course, it would be possible to insert preemption points in
the kernel daemon should we choose to support realtime response
in absence of CONFIG_PREEMPT.
That said, it may well be necessary to use IPIs in some cases,
as rcu-poll does, to accelerate RCU grace periods. For example,
if low on memory or if too many callbacks have accumulated.
If this ends up being necessary, it might get the best of both
worlds, since one would shorten grace periods only when there
are already a lot of callbacks, so that the overhead would be
nicely amortized. It will be interesting to see how this goes!
> > In my DoS testing setup, I see that limiting RCU softirqs
> > and re-arming tasklets has no effect on user process starvation.
>
> in an irq flood load that stalls userspace anyways it's ok to spread the
> callback load into the irqs, 10 tasklets and in turn 10 callbacks per
> irq or so. That load isn't scheduler friendly anyways.
The goal is to run reasonably, even under this workload, which, as you
say is not scheduler friendly. Scheduler hostile, in fact. ;-)
> the one property you need is not to be RT (like eventd is), but to be
> scheduler friendly yet guaranteed to make progress too, and that's what
> softirqs can give you; that's why I used only softirqs in my rcu_poll
> patches too ;).
The problem is that some of the workloads generate thousands of
RCU callbacks per grace period. If we are going to provide
realtime scheduling latencies in the hundreds of microseconds, we
probably aren't going to get away with executing all of these
callbacks in softirq context.
Thanx, Paul
On Wed, Mar 24, 2004 at 09:26:57AM -0800, Paul E. McKenney wrote:
> > > One problem likely to happen here is that under heavy interrupt
> > > load, a large number of softirqs still starve out user processes.
> >
> > Disagree - run 1 callback per tasklet and then you will not be able to
> > measure the cost of this callback compared to the cost of talking to the
> > hardware, entering/exiting the kernel, etc...
>
> The difficult situation is when the workload generates -lots- of
> RCU callbacks, such as the tiny-files workload that Andrew pointed
> Dipankar at. In this case, if we rely only on softirq, we are between
> a rock and a hard place. The rock is that if we run too many
> softirq handlers processing all the RCU callbacks, we will degrade
> realtime response. The hard place is that if we delay softirq
> processing in order to avoid degrading realtime response, we
> risk having RCU callbacks piling up, exhausting memory.
>
> This situation is what motivated using a per-CPU kernel daemon to
> handle the "overflow" callbacks that could not handled by softirq
> without degrading realtime response. Since the kernel daemon is
> preemptible, it can run continuously without degrading realtime
> response -- it will be preempted whenever a realtime task needs
> to run. Therefore, the kernel-daemon approach permits safely
> processing RCU callbacks up to the full capacity of the CPU.
and it tends to run the machine OOM if there's a single RT application
generating the dcache load ;).
> My guess is that the small-file creation/deletion workload can
> generate on the order of 100K RCU callbacks per second, and perhaps
> as many as 1M RCU callbacks per second on a fast CPU. The kernel
> daemon approach should be able to handle this load gracefully.
running 1 callback per softirq (and in turn 10 callbacks per hardware
irq) shouldn't be measurable compared to the cost of the hardware
handling, skb memory allocation, iommu mapping etc...
why do you care about this specific irq-flood corner case where the load
is lost in the noise and there's no way to make it scheduler-friendly
either since hardware irqs are involved?
the only way to make that workload scheduler friendly is to bind the
irq of the network card to cpu 1 and the RT app to cpu 0; there is no
other way around it, no matter where you run the rcu callbacks (in
irq context or not).
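e.g. something like this (irq number and masks are only examples):

	# route the NIC irq to cpu 1 only
	echo 2 > /proc/irq/16/smp_affinity
	# pin the RT app to cpu 0
	taskset 0x1 ./rt_app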
> Of course, it would be possible to insert preemption points in
> the kernel daemon should we choose to support realtime response
> in absence of CONFIG_PREEMPT.
2.6 supports realtime response with PREEMPT=y and =n; in fact, preempt
doesn't affect the worst-case RT latency at all, it can't.
So you shouldn't rely on preempt to avoid explicit schedule points
there.
> That said, it may well be necessary to use IPIs in some cases,
> as rcu-poll does, to accelerate RCU grace periods. For example,
> if low on memory or if too many callbacks have accumulated.
> If this ends up being necessary, it might get the best of both
> worlds, since one would shorten grace periods only when there
> are already a lot of callbacks, so that the overhead would be
> nicely amortized. It will be interesting to see how this goes!
I'm not sure IPIs are needed in this case just for offloading the
remaining work to a re-arming tasklet.
> > > In my DoS testing setup, I see that limiting RCU softirqs
> > > and re-arming tasklets has no effect on user process starvation.
> >
> > in an irq flood load that stalls userspace anyways it's ok to spread the
> > callback load into the irqs, 10 tasklets and in turn 10 callbacks per
> > irq or so. That load isn't scheduler friendly anyways.
>
> The goal is to run reasonably, even under this workload, which, as you
> say is not scheduler friendly. Scheduler hostile, in fact. ;-)
Indeed it is, and I'm simply not expecting any real difference from
running 10 callbacks per hardware irq, so I find it a not very
interesting workload for choosing between a softirq and the kernel thread,
but maybe I'm overlooking something.
> > the one property you need is not to be RT (like eventd is), but to be
> > scheduler friendly yet guaranteed to make progress too, and that's what
> > softirqs can give you; that's why I used only softirqs in my rcu_poll
> > patches too ;).
>
> The problem is that some of the workloads generate thousands of
> RCU callbacks per grace period. If we are going to provide
> realtime scheduling latencies in the hundreds of microseconds, we
> probably aren't going to get away with executing all of these
> callbacks in softirq context.
it should; you just need to run 1 callback per re-arming tasklet (then,
once the list is empty, you stop re-arming). The softirq code will do
the rest for you, offloading it immediately to ksoftirqd after 10
callbacks, and ksoftirqd will reschedule explicitly once every 10
callbacks too. The whole point of ksoftirqd is to make re-arming
tasklets irq-friendly. There's a cost in offloading the work to a
daemon, though, so we must not do it too frequently; we retry 10 times
before giving up and claiming the tasklet re-entrant.
On Wed, Mar 24, 2004 at 06:51:42PM +0100, Andrea Arcangeli wrote:
> On Wed, Mar 24, 2004 at 09:26:57AM -0800, Paul E. McKenney wrote:
> > > > One problem likely to happen here is that under heavy interrupt
> > > > load, a large number of softirqs still starve out user processes.
> > >
> > > Disagree - run 1 callback per tasklet and then you will not be able to
> > > measure the cost of this callback compared to the cost of talking to the
> > > hardware, entering/exiting the kernel, etc...
> >
> > The difficult situation is when the workload generates -lots- of
> > RCU callbacks, such as the tiny-files workload that Andrew pointed
> > Dipankar at. In this case, if we rely only on softirq, we are between
> > a rock and a hard place. The rock is that if we run too many
> > softirq handlers processing all the RCU callbacks, we will degrade
> > realtime response. The hard place is that if we delay softirq
> > processing in order to avoid degrading realtime response, we
> > risk having RCU callbacks piling up, exhausting memory.
> >
> > This situation is what motivated using a per-CPU kernel daemon to
> > handle the "overflow" callbacks that could not handled by softirq
> > without degrading realtime response. Since the kernel daemon is
> > preemptible, it can run continuously without degrading realtime
> > response -- it will be preempted whenever a realtime task needs
> > to run. Therefore, the kernel-daemon approach permits safely
> > processing RCU callbacks up to the full capacity of the CPU.
>
> and it tends to run the machine OOM if there's a single RT application
> generating the dcache load ;).
A CPU-bound RT application needs to run standalone on bare metal.
Realtime applications need to either leave enough CPU time for the
OS to get necessary housekeeping done, or they need to take
responsibility for doing everything, which means running without
an OS.
> > My guess is that the small-file creation/deletion workload can
> > generate on the order of 100K RCU callbacks per second, and perhaps
> > as many as 1M RCU callbacks per second on a fast CPU. The kernel
> > daemon approach should be able to handle this load gracefully.
>
> running 1 callback per softirq (and in turn 10 callbacks per hardware
> irq) shouldn't be measurable compared to the cost of the hardware
> handling, skb memory allocation, iommu mapping etc...
>
> why do you care about this specific irq-flood corner case where the load
> is lost in the noise and there's no way to make it scheduler-friendly
> either since hardware irqs are involved?
>
> the only way to make that workload scheduler friendly is to bind the
> irq of the network card to cpu 1 and the RT app to cpu 0; there is no
> other way around it, no matter where you run the rcu callbacks (in
> irq context or not).
Yes, these are two different problems. One other way to handle
the DoS case is to limit the amount of time in IRQ context.
> > Of course, it would be possible to insert preemption points in
> > the kernel daemon should we choose to support realtime response
> > in absence of CONFIG_PREEMPT.
>
> 2.6 supports realtime response with PREEMPT=y and =n; in fact, preempt
> doesn't affect the worst-case RT latency at all, it can't.
>
> So you shouldn't rely on preempt to avoid explicit schedule points
> there.
It can if there is a long preemptible kernel code path. It might
be that there is currently no such code path in the kernel.
However, I agree that there are a lot of advantages in avoiding
preemption, especially in SMP kernels. It is good to be able
to rely on running on the same CPU without taking special
precautions!
> > That said, it may well be necessary to use IPIs in some cases,
> > as rcu-poll does, to accelerate RCU grace periods. For example,
> > if low on memory or if too many callbacks have accumulated.
> > If this ends up being necessary, it might get the best of both
> > worlds, since one would shorten grace periods only when there
> > are already a lot of callbacks, so that the overhead would be
> > nicely amortized. It will be interesting to see how this goes!
>
> I'm not sure IPIs are needed in this case just for offloading the
> remaining work to a re-arming tasklet.
Different problem -- my thought here was to shorten the grace period
in order to stave off OOM.
> > > > In my DoS testing setup, I see that limiting RCU softirqs
> > > > and re-arming tasklets has no effect on user process starvation.
> > >
> > > in an irq flood load that stalls userspace anyways it's ok to spread the
> > > callback load into the irqs, 10 tasklets and in turn 10 callbacks per
> > > irq or so. That load isn't scheduler friendly anyways.
> >
> > The goal is to run reasonably, even under this workload, which, as you
> > say is not scheduler friendly. Scheduler hostile, in fact. ;-)
>
> Indeed it is, and I'm simply not expecting any real difference from
> running 10 callbacks per hardware irq, so I find it a not very
> interesting workload for choosing between a softirq and the kernel thread,
> but maybe I'm overlooking something.
Might be the "nice" value, which is 19 for softirq, and -19 for
kernel thread. But I am likely to be missing something here myself.
> > > the one property you need is not to be RT (like eventd is), but to be
> > > scheduler friendly yet guaranteed to make progress too, and that's what
> > > softirqs can give you; that's why I used only softirqs in my rcu_poll
> > > patches too ;).
> >
> > The problem is that some of the workloads generate thousands of
> > RCU callbacks per grace period. If we are going to provide
> > realtime scheduling latencies in the hundreds of microseconds, we
> > probably aren't going to get away with executing all of these
> > callbacks in softirq context.
>
> it should; you just need to run 1 callback per re-arming tasklet (then,
> once the list is empty, you stop re-arming). The softirq code will do
> the rest for you, offloading it immediately to ksoftirqd after 10
> callbacks, and ksoftirqd will reschedule explicitly once every 10
> callbacks too. The whole point of ksoftirqd is to make re-arming
> tasklets irq-friendly. There's a cost in offloading the work to a
> daemon, though, so we must not do it too frequently; we retry 10 times
> before giving up and claiming the tasklet re-entrant.
If the "nice" value does not matter, this seems reasonable, at least for
some value of 10. ;-)
Thanx, Paul
On Wed, Mar 24, 2004 at 06:51:42PM +0100, Andrea Arcangeli wrote:
> On Wed, Mar 24, 2004 at 09:26:57AM -0800, Paul E. McKenney wrote:
> running 1 callback per softirq (and in turn 10 callbacks per hardware
> irq) shouldn't be measurable compared to the cost of the hardware
> handling, skb memory allocation, iommu mapping etc...
>
> why do you care about this specific irq-flood corner case where the load
> is lost in the noise and there's no way to make it scheduler-friendly
> either since hardware irqs are involved?
We are looking at two different problems: scheduling latency, and
DoS on the route cache that results in dst cache overflows. The second
one is an irq-flood, but latency is the least of the problems there.
> > > > In my DoS testing setup, I see that limiting RCU softirqs
> > > > and re-arming tasklets has no effect on user process starvation.
> > >
> > > in an irq flood load that stalls userspace anyways it's ok to spread the
> > > callback load into the irqs, 10 tasklets and in turn 10 callbacks per
> > > irq or so. That load isn't scheduler friendly anyways.
> >
> > The goal is to run reasonably, even under this workload, which, as you
> > say is not scheduler friendly. Scheduler hostile, in fact. ;-)
>
> Indeed it is, and I'm simply not expecting any real difference from
> running 10 callbacks per hardware irq, so I find it a not very
> interesting workload for choosing between a softirq and the kernel thread,
> but maybe I'm overlooking something.
The difference here is that during the callbacks in the kernel thread,
I don't disable softirqs, unlike ksoftirqd, thus giving more opportunity
for preemption.
> > The problem is that some of the workloads generate thousands of
> > RCU callbacks per grace period. If we are going to provide
> > realtime scheduling latencies in the hundreds of microseconds, we
> > probably aren't going to get away with executing all of these
> > callbacks in softirq context.
>
> > it should; you just need to run 1 callback per re-arming tasklet (then,
> > once the list is empty, you stop re-arming). The softirq code will do
> > the rest for you, offloading it immediately to ksoftirqd after 10
> > callbacks, and ksoftirqd will reschedule explicitly once every 10
> > callbacks too. The whole point of ksoftirqd is to make re-arming
> > tasklets irq-friendly. There's a cost in offloading the work to a
> > daemon, though, so we must not do it too frequently; we retry 10 times
> > before giving up and claiming the tasklet re-entrant.
I had already been testing this for the DoS issue, but I tried it
with amlat on a 2.4 GHz (UP) P4 Xeon with 256MB RAM and dbench 32 in a
loop. Here are the results (CONFIG_PREEMPT = y) -
2.6.0 vanilla - 711 microseconds
2.6.0 + throttle-rcu - 439 microseconds
2.6.0 + rcu-low-lat - 413 microseconds
So under the dbench workload at least, throttling RCU works just as
well as offloading them to a kernel thread (krcud), as rcu-low-lat
does.
I used the following throttle-rcu patch with rcupdate.rcumaxbatch
set to 16 and rcupdate.rcuplugticks set to 0. That is essentially
equivalent to Andrea's earlier suggestion. I have so many knobs in
the patch because I had written it earlier for other experiments.
Anyway, if throttling works as well as this in terms of latency
for other workloads too and there aren't any OOM situations,
it is preferable to creating another per-cpu thread.
Thanks
Dipankar
Throttle RCU by enforcing a limit on how many callbacks are invoked per
softirq, and also implement a configurable plug.
include/linux/list.h | 21 +++++++++++++++++++++
include/linux/rcupdate.h | 7 ++++++-
kernel/rcupdate.c | 26 +++++++++++++++++++-------
kernel/sched.c | 2 ++
4 files changed, 48 insertions(+), 8 deletions(-)
diff -puN include/linux/list.h~throttle-rcu include/linux/list.h
--- linux-2.6.4-rcu/include/linux/list.h~throttle-rcu 2004-03-25 02:48:10.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/list.h 2004-03-25 02:48:10.000000000 +0530
@@ -251,6 +251,27 @@ static inline void list_splice(struct li
__list_splice(list, head);
}
+static inline void __list_splice_tail(struct list_head *list,
+ struct list_head *head)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+ struct list_head *at = head->prev;
+
+ first->prev = at;
+ at->next = first;
+ head->prev = last;
+ last->next = head;
+}
+
+
+static inline void list_splice_tail(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice_tail(list, head);
+}
+
/**
* list_splice_init - join two lists and reinitialise the emptied list.
* @list: the new list to add.
diff -puN include/linux/rcupdate.h~throttle-rcu include/linux/rcupdate.h
--- linux-2.6.4-rcu/include/linux/rcupdate.h~throttle-rcu 2004-03-25 02:48:10.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/include/linux/rcupdate.h 2004-03-25 02:49:17.000000000 +0530
@@ -96,6 +96,8 @@ struct rcu_data {
long batch; /* Batch # for current RCU batch */
struct list_head nxtlist;
struct list_head curlist;
+ struct list_head donelist;
+ int plugticks;
};
DECLARE_PER_CPU(struct rcu_data, rcu_data);
@@ -106,6 +108,8 @@ extern struct rcu_ctrlblk rcu_ctrlblk;
#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch)
#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist)
#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist)
+#define RCU_donelist(cpu) (per_cpu(rcu_data, (cpu)).donelist)
+#define RCU_plugticks(cpu) (per_cpu(rcu_data, (cpu)).plugticks)
#define RCU_QSCTR_INVALID 0
@@ -115,7 +119,8 @@ static inline int rcu_pending(int cpu)
rcu_batch_before(RCU_batch(cpu), rcu_ctrlblk.curbatch)) ||
(list_empty(&RCU_curlist(cpu)) &&
!list_empty(&RCU_nxtlist(cpu))) ||
- cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask))
+ cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask) ||
+ (!list_empty(&RCU_donelist(cpu)) && RCU_plugticks(cpu) == 0))
return 1;
else
return 0;
diff -puN kernel/rcupdate.c~throttle-rcu kernel/rcupdate.c
--- linux-2.6.4-rcu/kernel/rcupdate.c~throttle-rcu 2004-03-25 02:48:10.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/rcupdate.c 2004-03-25 02:56:11.000000000 +0530
@@ -39,6 +39,7 @@
#include <asm/atomic.h>
#include <asm/bitops.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
@@ -54,6 +55,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
#define RCU_tasklet(cpu) (per_cpu(rcu_tasklet, cpu))
+static int rcumaxbatch = 1000000;
+static int rcuplugticks = 4;
/**
* call_rcu - Queue an RCU update request.
@@ -83,16 +86,23 @@ void fastcall call_rcu(struct rcu_head *
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
-static void rcu_do_batch(struct list_head *list)
+static void rcu_do_batch(int cpu, struct list_head *list)
{
struct list_head *entry;
struct rcu_head *head;
+ int count = 0;
while (!list_empty(list)) {
entry = list->next;
list_del(entry);
head = list_entry(entry, struct rcu_head, list);
head->func(head->arg);
+ if (count >= rcumaxbatch) {
+ RCU_plugticks(cpu) = rcuplugticks;
+ if (!RCU_plugticks(cpu))
+ tasklet_hi_schedule(&RCU_tasklet(cpu));
+ break;
+ }
}
}
@@ -153,18 +163,16 @@ out_unlock:
spin_unlock(&rcu_ctrlblk.mutex);
}
-
/*
* This does the RCU processing work from tasklet context.
*/
static void rcu_process_callbacks(unsigned long unused)
{
int cpu = smp_processor_id();
- LIST_HEAD(list);
if (!list_empty(&RCU_curlist(cpu)) &&
rcu_batch_after(rcu_ctrlblk.curbatch, RCU_batch(cpu))) {
- list_splice(&RCU_curlist(cpu), &list);
+ list_splice_tail(&RCU_curlist(cpu), &RCU_donelist(cpu));
INIT_LIST_HEAD(&RCU_curlist(cpu));
}
@@ -185,8 +193,8 @@ static void rcu_process_callbacks(unsign
local_irq_enable();
}
rcu_check_quiescent_state();
- if (!list_empty(&list))
- rcu_do_batch(&list);
+ if (!list_empty(&RCU_donelist(cpu)) && !RCU_plugticks(cpu))
+ rcu_do_batch(cpu, &RCU_donelist(cpu));
}
void rcu_check_callbacks(int cpu, int user)
@@ -195,7 +203,7 @@ void rcu_check_callbacks(int cpu, int us
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT)))
RCU_qsctr(cpu)++;
- tasklet_schedule(&RCU_tasklet(cpu));
+ tasklet_hi_schedule(&RCU_tasklet(cpu));
}
static void __devinit rcu_online_cpu(int cpu)
@@ -204,6 +212,7 @@ static void __devinit rcu_online_cpu(int
tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL);
INIT_LIST_HEAD(&RCU_nxtlist(cpu));
INIT_LIST_HEAD(&RCU_curlist(cpu));
+ INIT_LIST_HEAD(&RCU_donelist(cpu));
}
static int __devinit rcu_cpu_notify(struct notifier_block *self,
@@ -237,6 +246,7 @@ void __init rcu_init(void)
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
+ printk("RCU: rcumaxbatch = %d, rcuplugticks = %d\n", rcumaxbatch, rcuplugticks);
}
@@ -262,6 +272,8 @@ void synchronize_kernel(void)
wait_for_completion(&completion);
}
+module_param(rcumaxbatch, int, 0);
+module_param(rcuplugticks, int, 0);
EXPORT_SYMBOL(call_rcu);
EXPORT_SYMBOL(synchronize_kernel);
diff -puN kernel/sched.c~throttle-rcu kernel/sched.c
--- linux-2.6.4-rcu/kernel/sched.c~throttle-rcu 2004-03-25 02:48:10.000000000 +0530
+++ linux-2.6.4-rcu-dipankar/kernel/sched.c 2004-03-25 02:48:10.000000000 +0530
@@ -1486,6 +1486,8 @@ void scheduler_tick(int user_ticks, int
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);
+ if (RCU_plugticks(cpu))
+ RCU_plugticks(cpu)--;
/* note: this timer irq context must be accounted for as well */
if (hardirq_count() - HARDIRQ_OFFSET) {
_
On Thu, Mar 25, 2004 at 03:09:15AM +0530, Dipankar Sarma wrote:
> On Wed, Mar 24, 2004 at 06:51:42PM +0100, Andrea Arcangeli wrote:
> > On Wed, Mar 24, 2004 at 09:26:57AM -0800, Paul E. McKenney wrote:
> > running 1 callback per softirq (and in turn 10 callbacks per hardware
> > irq) shouldn't be measurable compared to the cost of the hardware
> > handling, skb memory allocation, iommu mapping etc...
> >
> > why do you care about this specific irq-flood corner case where the load
> > is lost in the noise and there's no way to make it scheduler-friendly
> > either since hardware irqs are involved?
>
> We are looking at two different problems: scheduling latency, and
> DoS on the route cache that results in dst cache overflows. The second
> one is an irq-flood, but latency is the least of the problems there.
agreed.
> > > > > In my DoS testing setup, I see that limiting RCU softirqs
> > > > > and re-arming tasklets has no effect on user process starvation.
> > > >
> > > > in an irq flood load that stalls userspace anyways it's ok to spread the
> > > > callback load into the irqs, 10 tasklets and in turn 10 callbacks per
> > > > irq or so. That load isn't scheduler friendly anyways.
> > >
> > > The goal is to run reasonably, even under this workload, which, as you
> > > say is not scheduler friendly. Scheduler hostile, in fact. ;-)
> >
> > Indeed it is, and I'm simply not expecting any real difference from
> > running 10 callbacks per hardware irq, so I find it a not very
> > interesting workload for choosing between a softirq and the kernel thread,
> > but maybe I'm overlooking something.
>
> The difference here is that during the callbacks in the kernel thread,
> I don't disable softirqs, unlike ksoftirqd, thus giving more opportunity
> for preemption.
preemption where? in irq? ksoftirqd will preempt just fine.
Also note that while we process the callbacks in ksoftirqd, the irqs will
stop running the callbacks, just like in your scenario. The callbacks
and the other softirqs will be processed from process context, and it won't
be different from scheduling krcud first and ksoftirqd later.
> I had already been testing this for the DoS issue, but I tried it
> with amlat on a 2.4 GHz (UP) P4 Xeon with 256MB RAM and dbench 32 in a
> loop. Here are the results (CONFIG_PREEMPT = y) -
>
> 2.6.0 vanilla - 711 microseconds
> 2.6.0 + throttle-rcu - 439 microseconds
> 2.6.0 + rcu-low-lat - 413 microseconds
>
> So under the dbench workload at least, throttling RCU works just as
> well as offloading them to a kernel thread (krcud), as rcu-low-lat
> does.
very nice.
> I used the following throttle-rcu patch with rcupdate.rcumaxbatch
> set to 16 and rcupdate.rcuplugticks set to 0. That is essentially
> equivalent to Andrea's earlier suggestion. I have so many knobs in
> the patch because I had written it earlier for other experiments.
> Anyway, if throttling works as well as this in terms of latency
> for other workloads too and there aren't any OOM situations,
> it is preferable to creating another per-cpu thread.
I don't think this is enough (it is enough for the above workload,
though, and maybe for the dcache too, but it's not generic enough); 16
callbacks per tick is too low a frequency, it may not keep up; that's why I
suggested offloading the work to a re-arming tasklet that will finish it
ASAP (but in a scheduler-friendly way) instead of waiting for the next tick.
But I certainly agree this is a nice solution in practice (at least with
all the common usages).
thanks!
On Wed, Mar 24, 2004 at 11:53:26PM +0100, Andrea Arcangeli wrote:
> On Thu, Mar 25, 2004 at 03:09:15AM +0530, Dipankar Sarma wrote:
> >
> > The difference here is that during the callbacks in the kernel thread,
> > I don't disable softirqs unlike ksoftirqd thus giving it more opportunity for
> > preemption.
>
> preemption where? in irq? ksoftirqd will preempt just fine.
>
> Also note that while we process the callbacks in ksoftirqd, the irqs will
> stop running the callbacks, just like in your scenario. The callbacks
> and the other softirqs will be processed from process context, and it won't
> be different from scheduling krcud first and ksoftirqd later.
It is different because the heart of ksoftirqd - do_softirq() - is
run with local_bh_disable(). Sure, you get chances to preempt
there, but not as many as with krcud, which runs completely in process
context. Of course, it depends on how long we stay with
softirqs disabled in do_softirq().
> > I had already been testing this for the DoS issue, but I tried it
> > with amlat on a 2.4 GHz (UP) P4 Xeon with 256MB RAM and dbench 32 in a
> > loop. Here are the results (CONFIG_PREEMPT = y) -
> >
> > 2.6.0 vanilla - 711 microseconds
> > 2.6.0 + throttle-rcu - 439 microseconds
> > 2.6.0 + rcu-low-lat - 413 microseconds
> >
> > So under the dbench workload at least, throttling RCU works just as
> > well as offloading them to a kernel thread (krcud), as rcu-low-lat
> > does.
>
> very nice.
Yes. BTW, throttle-rcu too can't avoid route cache overflow under
the DoS stress test. Different problem, but I just wanted to mention
it. I will publish those results separately.
> > I used the following throttle-rcu patch with rcupdate.rcumaxbatch
> > set to 16 and rcupdate.rcuplugticks set to 0. That is essentially
> > equivalent to Andrea's earlier suggestion. I have so many knobs in
> > the patch because I had written it earlier for other experiments.
> > Anyway, if throttling works as well as this in terms of latency
> > for other workloads too and there aren't any OOM situations,
> > it is preferable to creating another per-cpu thread.
>
> I don't think this is enough (it is enough for the above workload,
> though, and maybe for the dcache too, but it's not generic enough); 16
> callbacks per tick is too low a frequency, it may not keep up; that's why I
That was not 16 callbacks per tick, it was 16 callbacks in one
batch of a single softirq. And then I reschedule the RCU tasklet
to process the rest. I am planning to vary this and see if we
should do even less per softirq.
> But I certainly agree this is a nice solution in practice (at least with
> all the common usages).
Needs more testing with other workloads. I will have some more results
with the tiny files test.
Thanks
Dipankar
On Wed, Mar 24, 2004 at 12:02:08PM -0800, Paul E. McKenney wrote:
> If the "nice" value does not matter, this seems reasonable, at least for
> some value of 10. ;-)
the nice value should not matter for this.
btw, (just to avoid misunderstanding) the number 10 is
MAX_SOFTIRQ_RESTART.
On Thu, Mar 25, 2004 at 04:41:45AM +0530, Dipankar Sarma wrote:
> That was not 16 callbacks per tick, it was 16 callbacks in one
> batch of a single softirq. And then I reschedule the RCU tasklet
sorry so you're already using tasklets in current code? I misunderstood
the current code then.
> to process the rest. I am planning to vary this and see if we
> should do even less per softirq.
yes, I think 16 is too much; the softirq code should just retry 10
times, summing up to 160 callbacks. After you re-arm the tasklet the
first time, all other re-armed invocations should probably execute fewer
than 16 callbacks.
it greatly depends on the number of times we retry a softirq before
giving up and offloading the work to ksoftirqd; that number is currently
10 (see MAX_SOFTIRQ_RESTART). The bigger that number, the fewer
callbacks you can execute per tasklet.
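roughly, the do_softirq() logic is (a paraphrase of kernel/softirq.c,
not the verbatim source; run_pending_handlers() stands in for the real
handler loop):

#define MAX_SOFTIRQ_RESTART 10

asmlinkage void do_softirq(void)
{
	int max_restart = MAX_SOFTIRQ_RESTART;
	__u32 pending = local_softirq_pending();

	while (pending) {
		/* runs softirq handlers, including re-armed tasklets */
		run_pending_handlers(pending);
		pending = local_softirq_pending();
		if (pending && !--max_restart) {
			/* still busy after 10 passes: defer to ksoftirqd */
			wakeup_softirqd();
			break;
		}
	}
}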
On Thu, Mar 25, 2004 at 12:34:30AM +0100, Andrea Arcangeli wrote:
> On Thu, Mar 25, 2004 at 04:41:45AM +0530, Dipankar Sarma wrote:
> > That was not 16 callbacks per tick, it was 16 callbacks in one
> > batch of a single softirq. And then I reschedule the RCU tasklet
>
> sorry so you're already using tasklets in current code? I misunderstood
> the current code then.
+ if (count >= rcumaxbatch) {
+ RCU_plugticks(cpu) = rcuplugticks;
+ if (!RCU_plugticks(cpu))
+ tasklet_hi_schedule(&RCU_tasklet(cpu));
+ break;
+ }
That does it. Although the tasklet handler needs to be optimized
for such frequent rescheduling when there isn't anything
else to process on that cpu. Later.
Thanks
Dipankar
On Thu, Mar 25, 2004 at 05:16:43AM +0530, Dipankar Sarma wrote:
> On Thu, Mar 25, 2004 at 12:34:30AM +0100, Andrea Arcangeli wrote:
> > On Thu, Mar 25, 2004 at 04:41:45AM +0530, Dipankar Sarma wrote:
> > > That was not 16 callbacks per tick, it was 16 callbacks in one
> > > batch of a single softirq. And then I reschedule the RCU tasklet
> >
> > sorry so you're already using tasklets in current code? I misunderstood
> > the current code then.
>
> + if (count >= rcumaxbatch) {
> + RCU_plugticks(cpu) = rcuplugticks;
> + if (!RCU_plugticks(cpu))
> + tasklet_hi_schedule(&RCU_tasklet(cpu));
> + break;
> + }
>
> That does it. Although the tasklet handler needs to be optimized
yes, I've noticed it reading just the above chunk of your patch; I just
didn't notice there was a tasklet already there ready for use :/.
thanks.
On Thu, Mar 25, 2004 at 12:36:29AM +0100, Andrea Arcangeli wrote:
> On Wed, Mar 24, 2004 at 12:02:08PM -0800, Paul E. McKenney wrote:
> > If the "nice" value does not matter, this seems reasonable, at least for
> > some value of 10. ;-)
>
> the nice value should not matter for this.
I agree that there would not likely be any differences except in
corner-case OOM situations, and that we would probably not want
to rely on such differences in any case.
> btw, (just to avoid misunderstanding) the number 10 is
> MAX_SOFTIRQ_RESTART.
Ah! Thank you for the clarification -- I thought you were
talking about the number of RCU callbacks to be executed in each
rcu_do_batch() invocation. And, yes, after MAX_SOFTIRQ_RESTART,
ksoftirqd does re-enable preemption.
Thanx, Paul
At Thu, 25 Mar 2004 05:16:43 +0530,
Dipankar Sarma wrote:
>
> On Thu, Mar 25, 2004 at 12:34:30AM +0100, Andrea Arcangeli wrote:
> > On Thu, Mar 25, 2004 at 04:41:45AM +0530, Dipankar Sarma wrote:
> > > That was not 16 callbacks per tick, it was 16 callbacks in one
> > > batch of a single softirq. And then I reschedule the RCU tasklet
> >
> > sorry so you're already using tasklets in current code? I misunderstood
> > the current code then.
>
> + if (count >= rcumaxbatch) {
> + RCU_plugticks(cpu) = rcuplugticks;
> + if (!RCU_plugticks(cpu))
> + tasklet_hi_schedule(&RCU_tasklet(cpu));
> + break;
> + }
it seems count is never incremented in your patch...
or am i missing something?
anyway, i confirmed that with the original krcud patch the latency
with dcache flood can be eliminated.
for the non-preemptive case, rcu_bh_callback_limit() should return
bhlimit always, though. otherwise cond_resched() isn't called in the
callback loop properly.
Takashi
On Sun, Mar 28, 2004 at 06:53:47PM +0200, Takashi Iwai wrote:
> At Thu, 25 Mar 2004 05:16:43 +0530,
> Dipankar Sarma wrote:
> >
> > On Thu, Mar 25, 2004 at 12:34:30AM +0100, Andrea Arcangeli wrote:
> > > On Thu, Mar 25, 2004 at 04:41:45AM +0530, Dipankar Sarma wrote:
> > > > That was not 16 callbacks per tick, it was 16 callbacks in one
> > > > batch of a single softirq. And then I reschedule the RCU tasklet
> > >
> > > sorry so you're already using tasklets in current code? I misunderstood
> > > the current code then.
> >
> > + if (count >= rcumaxbatch) {
> > + RCU_plugticks(cpu) = rcuplugticks;
> > + if (!RCU_plugticks(cpu))
> > + tasklet_hi_schedule(&RCU_tasklet(cpu));
> > + break;
> > + }
>
> it seems count is never incremented in your patch...
> or am i missing something?
I messed it up when I forward-ported throttle-rcu.patch
from 2.6.0+lots-of-instrumentation to vanilla 2.6.4 in order
to publish it on lkml. The original patch did this -
@@ -110,6 +113,10 @@ static void rcu_do_batch(int cpu, struct
head->func(head->arg);
RCU_nr_rcupdates(cpu)++;
count++;
+ if (count >= rcumaxbatch) {
+ RCU_plugticks(cpu) = rcuplugticks;
+ break;
+ }
}
Sorry about that.
> anyway, i confirmed that with the original krcud patch the latency
> with dcache flood can be eliminated.
Does the throttle-rcu patch also help eliminate the dcache flood latency? You
can try by just changing count >= rcumaxbatch to ++count > rcumaxbatch.
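i.e. with the increment folded in, the check in rcu_do_batch() becomes:

	head->func(head->arg);
	if (++count > rcumaxbatch) {
		RCU_plugticks(cpu) = rcuplugticks;
		if (!RCU_plugticks(cpu))
			tasklet_hi_schedule(&RCU_tasklet(cpu));
		break;
	}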
>
> for the non-preemptive case, rcu_bh_callback_limit() should return
> bhlimit always, though. otherwise cond_resched() isn't called in the
> callback loop properly.
Yes, I think we should consider using limiting even in the non-preemptive
case.
Thanks
Dipankar
At Sun, 28 Mar 2004 22:50:36 +0530,
Dipankar Sarma wrote:
>
> On Sun, Mar 28, 2004 at 06:53:47PM +0200, Takashi Iwai wrote:
>
> > it seems count is never incremented in your patch...
> > or am i missing something?
>
> I messed it up when I forward-ported throttle-rcu.patch
> from 2.6.0+lots-of-instrumentation to vanilla 2.6.4 in order
> to publish it on lkml. The original patch did this -
thanks for the patch. i expected a similar fix :)
> > anyway, i confirmed that with the original krcud patch the latency
> > with dcache flood can be eliminated.
>
> Does the throttle-rcu patch also help eliminate the dcache flood latency? You
> can try by just changing count >= rcumaxbatch to ++count > rcumaxbatch.
i'll try it later.
> > for the non-preemptive case, rcu_bh_callback_limit() should return
> > bhlimit always, though. otherwise cond_resched() isn't called in the
> > callback loop properly.
>
> Yes, I think we should consider using limiting even in the non-preemptive
> case.
you mean the preemptive case?
Takashi
At Sun, 28 Mar 2004 19:28:41 +0200,
I wrote:
>
> > > anyway, i confirmed that with the original krcud patch the latency
> > > with dcache flood can be eliminated.
> >
> > Does the throttle-rcu patch also help eliminate the dcache flood latency? You
> > can try by just changing count >= rcumaxbatch to ++count > rcumaxbatch.
>
> i'll try it later.
the throttle-rcu patch does indeed work well even without preemption.
i've tested maxbatch=16 and plugticks=0. in the older version, there
was a 20ms latency, while in the patched version, there is no measurable
latency above 1ms.
thanks for your work!
Takashi
On Mon, Mar 29, 2004 at 12:43:11PM +0200, Takashi Iwai wrote:
> At Sun, 28 Mar 2004 19:28:41 +0200,
> I wrote:
> >
> > > > anyway, i confirmed that with the original krcud patch the latency
> > > > with dcache flood can be eliminated.
> > >
> > > Does the throttle-rcu patch also help eliminate dcache flood ? You
> > > can try by just changing count >= rcumaxbatch to ++count > rcumaxbatch.
> >
> > i'll try it later.
>
> the throttle-rcu patch does indeed work well even without preemption.
> i've tested maxbatch=16 and plugticks=0. in the older version, there
> was a 20ms latency, while in the patched version, there is no measurable
> latency above 1ms.
Thanks for the measurements. throttle-rcu may eventually be the way
to go, but I would wait until we have sorted out several other problems,
like the route cache DoS that we are currently testing.
Thanks
Dipankar