2009-03-17 01:32:40

by Mathieu Desnoyers

Subject: cli/sti vs local_cmpxchg and local_add_return

Hi,

I am trying to get access to some non-x86 hardware to run some atomic
primitive benchmarks for a paper on LTTng I am preparing. That should be
useful to argue for the performance benefit of per-cpu atomic operations
vs interrupt disabling. I would like to run the following benchmark
module on CONFIG_SMP:

- PowerPC
- MIPS
- ia64
- alpha

usage :
make
insmod test-cmpxchg-nolock.ko
insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily unavailable
dmesg (see dmesg output)

If some of you would be kind enough to run my test module provided below
and provide the results of these tests on a recent kernel (2.6.26~2.6.29
should be good) along with their cpuinfo, I would greatly appreciate it.

Here are the CAS results for various x86 CPUs:

Architecture        | Speedup                     |     CAS      |        Interrupts
                    | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
------------------------------------------------------------------------------------------------
Intel Pentium 4     | 5.24                        |  25   |  81  |      70      |      61
AMD Athlon(tm)64 X2 | 4.57                        |   7   |  17  |      17      |      15
Intel Core2         | 6.33                        |   6   |  30  |      20      |      18
Intel Xeon E5405    | 5.25                        |   8   |  24  |      20      |      22

(Speedup = (Disable + Enable) / local CAS; e.g. (61 + 70) / 25 = 5.24
for the Pentium 4.)

The benefit expected on PowerPC, ia64 and alpha should come principally
from the removal of memory barriers in the local primitives.

Thanks,

Mathieu

P.S. please forgive the coding style and hackish interface. :)


/* test-cmpxchg-nolock.c
 *
 * Compare local cmpxchg with irq disable / enable.
 */


#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <asm/timex.h>
#include <asm/local.h>
#include <asm/system.h>

#define NR_LOOPS 20000

int test_val;

static void do_testbaseline(void)
{
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                asm volatile ("");
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for baseline\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> baseline takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_sync_cmpxchg(void)
{
        int ret;
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
#ifdef CONFIG_X86_32
                ret = sync_cmpxchg(&test_val, 0, 0);
#else
                ret = cmpxchg(&test_val, 0, 0);
#endif
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for locked cmpxchg\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> locked cmpxchg takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_cmpxchg(void)
{
        int ret;
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                ret = cmpxchg_local(&test_val, 0, 0);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for non locked cmpxchg\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> non locked cmpxchg takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_sync_inc(void)
{
        int ret;
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;
        atomic_t val = ATOMIC_INIT(0);

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                ret = atomic_add_return(10, &val);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for locked add return\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> locked add return takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}


static void do_test_inc(void)
{
        int ret;
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;
        local_t loc_val = LOCAL_INIT(0);

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                ret = local_add_return(10, &loc_val);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for non locked add return\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> non locked add return takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}



/*
 * This test will have a higher standard deviation due to incoming interrupts.
 */
static void do_test_enable_int(void)
{
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                /* flags was saved with irqs enabled, so this re-enables them */
                local_irq_restore(flags);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for enabling interrupts (STI)\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> enabling interrupts (STI) takes %llu cycles\n",
               time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_disable_int(void)
{
        unsigned long flags, flags2;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                local_irq_save(flags2);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for disabling interrupts (CLI)\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> disabling interrupts (CLI) takes %llu cycles\n",
               time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_int(void)
{
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        u32 rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                local_irq_restore(flags);
                local_irq_save(flags);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for disabling/enabling interrupts (STI/CLI)\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_u64_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> enabling/disabling interrupts (STI/CLI) takes %llu cycles\n",
               time);
        printk(KERN_ALERT "test end\n");
}



static int ltt_test_init(void)
{
        printk(KERN_ALERT "test init\n");

        do_testbaseline();
        do_test_sync_cmpxchg();
        do_test_cmpxchg();
        do_test_sync_inc();
        do_test_inc();
        do_test_enable_int();
        do_test_disable_int();
        do_test_int();
        return -EAGAIN; /* Fail will directly unload the module */
}

static void ltt_test_exit(void)
{
        printk(KERN_ALERT "test exit\n");
}

module_init(ltt_test_init)
module_exit(ltt_test_exit)

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Cmpxchg vs int Test");



* Makefile

ifneq ($(KERNELRELEASE),)
obj-m += test-cmpxchg-nolock.o
else
KERNELDIR ?= /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
KERNELRELEASE = $(shell cat $(KERNELDIR)/$(KBUILD_OUTPUT)/include/linux/version.h | sed -n 's/.*UTS_RELEASE.*\"\(.*\)\".*/\1/p')
ifneq ($(INSTALL_MOD_PATH),)
DEPMOD_OPT := -b $(INSTALL_MOD_PATH)
endif

default:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules

modules_install:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules_install
	if [ -f $(KERNELDIR)/$(KBUILD_OUTPUT)/System.map ] ; then /sbin/depmod -ae -F $(KERNELDIR)/$(KBUILD_OUTPUT)/System.map $(DEPMOD_OPT) $(KERNELRELEASE) ; fi

clean:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
endif


--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68


2009-03-17 03:37:29

by David Miller

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

From: Mathieu Desnoyers <[email protected]>
Date: Mon, 16 Mar 2009 21:32:20 -0400

> If some of you would be kind enough to run my test module provided below
> and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> should be good) along with their cpuinfo, I would greatly appreciate.

Here's sparc64, but cycles is always computed as zero.

Probably that's because get_cycles() on sparc64 counts system
bus clock cycles, not CPU cycles, and the loop iteration
isn't expensive enough to get a system clock tick in.

This is a dual UltraSPARC-IIIi at 1.2 GHz

[1052598.484452] test init
[1052598.486230] test results: time for baseline
[1052598.487878] number of loops: 20000
[1052598.489485] total time: 752
[1052598.491061] -> baseline takes 0 cycles
[1052598.492649] test end
[1052598.494874] test results: time for locked cmpxchg
[1052598.496460] number of loops: 20000
[1052598.498005] total time: 7879
[1052598.499521] -> locked cmpxchg takes 0 cycles
[1052598.501060] test end
[1052598.503194] test results: time for non locked cmpxchg
[1052598.504733] number of loops: 20000
[1052598.506213] total time: 7879
[1052598.507722] -> non locked cmpxchg takes 0 cycles
[1052598.509229] test end
[1052598.511347] test results: time for locked add return
[1052598.512821] number of loops: 20000
[1052598.514265] total time: 8254
[1052598.515682] -> locked add return takes 0 cycles
[1052598.517130] test end
[1052598.519427] test results: time for non locked add return
[1052598.520850] number of loops: 20000
[1052598.522230] total time: 11259
[1052598.523561] -> non locked add return takes 0 cycles
[1052598.524939] test end
[1052598.526393] test results: time for enabling interrupts (STI)
[1052598.527767] number of loops: 20000
[1052598.529085] total time: 1877
[1052598.530373] -> enabling interrupts (STI) takes 0 cycles
[1052598.531713] test end
[1052598.533240] test results: time for disabling interrupts (CLI)
[1052598.534594] number of loops: 20000
[1052598.535892] total time: 3189
[1052598.537189] -> disabling interrupts (CLI) takes 0 cycles
[1052598.538551] test end
[1052598.540176] test results: time for disabling/enabling interrupts (STI/CLI)
[1052598.541579] number of loops: 20000
[1052598.542900] total time: 3940
[1052598.544207] -> enabling/disabling interrupts (STI/CLI) takes 0 cycles
[1052598.545595] test end

2009-03-17 04:10:30

by Mathieu Desnoyers

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* David Miller ([email protected]) wrote:
> From: Mathieu Desnoyers <[email protected]>
> Date: Mon, 16 Mar 2009 21:32:20 -0400
>
> > If some of you would be kind enough to run my test module provided below
> > and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> > should be good) along with their cpuinfo, I would greatly appreciate.
>
> Here's sparc64, but cycles is always computed as zero.
>
> Probably that's because get_cycles() on sparc64 counts system
> bus clock cycles, not CPU cycles, and the loop iteration
> isn't expensive enough to get a system clock tick in.
>
> This is a dual UltraSPARC-IIIi at 1.2 GHz
>

Hi David,

Thanks for running those tests. Actually, I did not expect good results
for sparc64 because the local_t primitives map to atomic_t. Looking at
sparc atomic_64.h, I notice that all atomic operations except cmpxchg
are done through function calls, even when those functions contain only
a few instructions. Is there any particular reason for that? These
function calls can be quite costly. We could easily inline them.

And to "unleash" the full power of local_t, we should see if there are
variants of the atomic operations which are safe only on UP, and if
there are memory barriers currently embedded in the atomic_t ops that we
could remove in a local_t version. Actually, all the
BACKOFF_SETUP/BACKOFF_SPIN handling is specific to SMP, so the local_t
version probably does not need it, since it touches per-cpu data
exclusively. That could give very interesting results.

The reason the results show 0 cycles per loop is simply that each loop
takes less than one bus clock cycle (e.g. 7879 / 20000 is about 0.4 bus
cycles per locked cmpxchg). But the total time (in bus cycles) for the
whole 20000 loops gives us equivalent information.

Mathieu

> [sparc64 results quoted in full above snipped]

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-17 04:27:41

by David Miller

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

From: Mathieu Desnoyers <[email protected]>
Date: Tue, 17 Mar 2009 00:10:16 -0400

> Thanks for running those tests. Actually, I did not expect good results
> for sparc64 because the local_t primitives map to atomic_t. Looking at
> sparc atomic_64.h, I notice that all atomic operations except cmpxchg
> are done through function calls even when those functions only contain
> few instructions. Is there any particular reason for that ? These
> function calls can be quite costly. We could easily inline those.

With all the memory barriers, cpu bug workarounds, et al.
it's way too much to expand inline.

> And to "unleash" the full power of local_t, we should see if there are
> variants of the atomic operations which are safe only on UP and if there
> are some memory barriers currently embedded in the atomic_t ops we could
> remove in a local_t version. Actually, all the
> BACKOFF_SETUP/BACKOFF_SPIN is specific to SMP, and therefore the local_t
> version probably does not need that because it touches specifically
> per-cpu data. That could give very interesting results.
>
> The reason why the results show 0 cycles per loop is just because there
> is less than a bus clock cycle per loop. But the total time (in bus
> cycles) for the whole 20000 loops gives us equivalent information.

I don't think it's worth it. Rusty ran similar tests not too long
ago.

IRQ disabling/enabling on sparc64 is 9 cycles (each), while the atomic
operation is at least 35 cycles.

2009-03-17 04:45:20

by Mathieu Desnoyers

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* David Miller ([email protected]) wrote:
> From: Mathieu Desnoyers <[email protected]>
> Date: Tue, 17 Mar 2009 00:10:16 -0400
>
> > Thanks for running those tests. Actually, I did not expect good results
> > for sparc64 because the local_t primitives map to atomic_t. Looking at
> > sparc atomic_64.h, I notice that all atomic operations except cmpxchg
> > are done through function calls even when those functions only contain
> > few instructions. Is there any particular reason for that ? These
> > function calls can be quite costly. We could easily inline those.
>
> With all the memory barriers, cpu bug workarounds, et al.
> it's way too much to expand inline.
>
> > And to "unleash" the full power of local_t, we should see if there are
> > variants of the atomic operations which are safe only on UP and if there
> > are some memory barriers currently embedded in the atomic_t ops we could
> > remove in a local_t version. Actually, all the
> > BACKOFF_SETUP/BACKOFF_SPIN is specific to SMP, and therefore the local_t
> > version probably does not need that because it touches specifically
> > per-cpu data. That could give very interesting results.
> >
> > The reason why the results show 0 cycles per loop is just because there
> > is less than a bus clock cycle per loop. But the total time (in bus
> > cycles) for the whole 20000 loops gives us equivalent information.
>
> I don't think it's worth it. Rusty made similar tests not too long
> ago.
>
> IRQ disabling/enabling on sparc64 is 9 cycles (each) and the atomic
> operation on the other hand is at least 35 cycles.

OK, so sparc64 should probably implement local_t with interrupt
disabling on the local CPU plus two aligned atomic operations (1 read, 1
write) of 64-bit variables from/to memory, so we make sure that if a
remote CPU simply reads the value, it is never seen as corrupted.

Note that any code doing "remote reads" and "writes expected to be read
from a remote cpu" on local_t variables must provide its own memory
barriers.
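
A minimal sketch of that scheme (illustrative only, not actual sparc64
kernel code; it assumes the generic local_t layout, which wraps an
atomic_long_t):

static inline long local_add_return_irqoff(long i, local_t *l)
{
        unsigned long flags;
        long ret;

        local_irq_save(flags);
        ret = l->a.counter + i;
        /*
         * Single aligned 64-bit store: a remote reader never sees a
         * torn value, but must still provide its own memory barriers.
         */
        l->a.counter = ret;
        local_irq_restore(flags);
        return ret;
}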

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-17 05:01:50

by Paul E. McKenney

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

On Mon, Mar 16, 2009 at 09:32:20PM -0400, Mathieu Desnoyers wrote:
> Hi,
>
> I am trying to get access to some non-x86 hardware to run some atomic
> primitive benchmarks for a paper on LTTng I am preparing. That should be
> useful to argue about performance benefit of per-cpu atomic operations
> vs interrupt disabling. I would like to run the following benchmark
> module on CONFIG_SMP :
>
> - PowerPC
> - MIPS
> - ia64
> - alpha
>
> usage :
> make
> insmod test-cmpxchg-nolock.ko
> insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily unavailable
> dmesg (see dmesg output)

Here you are on a 4.2GHz Power box:

test init
test results: time for baseline
number of loops: 20000
total time: 12490
-> baseline takes 0 cycles
test end
test results: time for locked cmpxchg
number of loops: 20000
total time: 345748
-> locked cmpxchg takes 17 cycles
test end
test results: time for non locked cmpxchg
number of loops: 20000
total time: 198304
-> non locked cmpxchg takes 9 cycles
test end
test results: time for locked add return
number of loops: 20000
total time: 253977
-> locked add return takes 12 cycles
test end
test results: time for non locked add return
number of loops: 20000
total time: 189837
-> non locked add return takes 9 cycles
test end
test results: time for enabling interrupts (STI)
number of loops: 20000
total time: 298390
-> enabling interrupts (STI) takes 14 cycles
test end
test results: time for disabling interrupts (CLI)
number of loops: 20000
total time: 43977
-> disabling interrupts (CLI) takes 2 cycles
test end
test results: time for disabling/enabling interrupts (STI/CLI)
number of loops: 20000
total time: 298773
-> enabling/disabling interrupts (STI/CLI) takes 14 cycles
test end

Thanx, Paul


> If some of you would be kind enough to run my test module provided below
> and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> should be good) along with their cpuinfo, I would greatly appreciate.
>
> Here are the CAS results for various Intel-based architectures :
>
> Architecture | Speedup | CAS | Interrupts |
> | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> -------------------------------------------------------------------------------------------------
> Intel Pentium 4 | 5.24 | 25 | 81 | 70 | 61 |
> AMD Athlon(tm)64 X2 | 4.57 | 7 | 17 | 17 | 15 |
> Intel Core2 | 6.33 | 6 | 30 | 20 | 18 |
> Intel Xeon E5405 | 5.25 | 8 | 24 | 20 | 22 |
>
> The benefit expected on PowerPC, ia64 and alpha should principally come
> from removed memory barriers in the local primitives.
>
> Thanks,
>
> Mathieu
>
> P.S. please forgive the coding style and hackish interface. :)
>
>
> [benchmark module source and Makefile quoted in full above snipped]

2009-03-17 06:05:53

by Nick Piggin

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

On Tuesday 17 March 2009 12:32:20 Mathieu Desnoyers wrote:
> Hi,
>
> I am trying to get access to some non-x86 hardware to run some atomic
> primitive benchmarks for a paper on LTTng I am preparing. That should be
> useful to argue about performance benefit of per-cpu atomic operations
> vs interrupt disabling. I would like to run the following benchmark
> module on CONFIG_SMP :
>
> - PowerPC
> - MIPS
> - ia64
> - alpha
>
> usage :
> make
> insmod test-cmpxchg-nolock.ko
> insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily
> unavailable dmesg (see dmesg output)
>
> If some of you would be kind enough to run my test module provided below
> and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> should be good) along with their cpuinfo, I would greatly appreciate.
>
> Here are the CAS results for various Intel-based architectures :
>
> Architecture        | Speedup                     |     CAS      |        Interrupts
>                     | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> -----------------------------------------------------------------------------------------------
> Intel Pentium 4     | 5.24                        |  25   |  81  |      70      |      61
> AMD Athlon(tm)64 X2 | 4.57                        |   7   |  17  |      17      |      15
> Intel Core2         | 6.33                        |   6   |  30  |      20      |      18
> Intel Xeon E5405    | 5.25                        |   8   |  24  |      20      |      22
>
> The benefit expected on PowerPC, ia64 and alpha should principally come
> from removed memory barriers in the local primitives.

Benefit versus what? I think all of those architectures can do SMP
atomic compare exchange sequences without barriers, can't they?

2009-03-17 15:14:58

by Mathieu Desnoyers

Subject: Re: [ltt-dev] cli/sti vs local_cmpxchg and local_add_return

* Nick Piggin ([email protected]) wrote:
> On Tuesday 17 March 2009 12:32:20 Mathieu Desnoyers wrote:
> > Hi,
> >
> > I am trying to get access to some non-x86 hardware to run some atomic
> > primitive benchmarks for a paper on LTTng I am preparing. That should be
> > useful to argue about performance benefit of per-cpu atomic operations
> > vs interrupt disabling. I would like to run the following benchmark
> > module on CONFIG_SMP :
> >
> > - PowerPC
> > - MIPS
> > - ia64
> > - alpha
> >
> > usage :
> > make
> > insmod test-cmpxchg-nolock.ko
> > insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily
> > unavailable dmesg (see dmesg output)
> >
> > If some of you would be kind enough to run my test module provided below
> > and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> > should be good) along with their cpuinfo, I would greatly appreciate.
> >
> > Here are the CAS results for various Intel-based architectures :
> >
> > Architecture        | Speedup                     |     CAS      |        Interrupts
> >                     | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> > -----------------------------------------------------------------------------------------------
> > Intel Pentium 4     | 5.24                        |  25   |  81  |      70      |      61
> > AMD Athlon(tm)64 X2 | 4.57                        |   7   |  17  |      17      |      15
> > Intel Core2         | 6.33                        |   6   |  30  |      20      |      18
> > Intel Xeon E5405    | 5.25                        |   8   |  24  |      20      |      22
> >
> > The benefit expected on PowerPC, ia64 and alpha should principally come
> > from removed memory barriers in the local primitives.
>
> Benefit versus what? I think all of those architectures can do SMP
> atomic compare exchange sequences without barriers, can't they?
>

Hi Nick,

I want to compare whether it is faster to use SMP cas without barriers
to synchronize the tracing hot path wrt interrupts, or faster to simply
disable interrupts. The decision will depend on the benchmark I propose,
because it compares the time it takes to perform both.

Overall, the benchmarks will make it possible to choose between these
two simplified hotpath pseudo-codes (offset is global to the buffer,
commit_count is per-subbuffer).


* lockless :

do {
old_offset = local_read(&offset);
get_cycles();
compute needed size.
new_offset = old_offset + size;
} while (local_cmpxchg(&offset, old_offset, new_offset) != old_offset);

/*
* note : writing to buffer is done out-of-order wrt buffer slot
* physical order.
*/
write_to_buffer(offset);

/*
* Make sure the data is written in the buffer before commit count is
* incremented.
*/
smp_wmb();

/* note : incrementing the commit count is also done out-of-order */
count = local_add_return(size, &commit_count[subbuf_index]);
if (count is filling a subbuffer)
allow to wake up readers
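
In compilable form, the reservation loop above would look roughly like
this (a sketch only; ring_offset, reserve_slot and the single global
counter are illustrative, not LTTng's actual code):

#include <asm/local.h>

static local_t ring_offset;

static unsigned long reserve_slot(unsigned long size)
{
        unsigned long old, new;

        do {
                old = local_read(&ring_offset);
                new = old + size;
                /*
                 * The cmpxchg fails if an interrupt on this CPU reserved
                 * space in the meantime; simply retry from the new offset.
                 */
        } while (local_cmpxchg(&ring_offset, old, new) != old);

        return old;     /* start offset of the reserved slot */
}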


* irq off :

(note : offset and commit count would each be written to atomically
(type unsigned long))

local_irq_save(flags);

get_cycles();
compute needed size;
offset += size;

write_to_buffer(offset);

/*
* Make sure the data is written in the buffer before commit count is
* incremented.
*/
smp_wmb();

commit_count[subbuf_index] += size;
if (commit_count[subbuf_index] is filling a subbuffer)
allow to wake up readers

local_irq_restore(flags);


* read-side

And basically, the data reader uses its own consumed data offset
"consumed" and reads the commit count corresponding to the subbuffer it
is about to read. It has the following pseudo-code:

(note: commit_count and offset are each read atomically)

consumed_old = atomic_long_read(&consumed);
compute consumed_idx from consumed_old
commit_count = commit_count[consumed_idx];
(or commit_count = local_read(&commit_count[consumed_idx]) for lockless)

/*
* read commit count before reading the buffer data and write offset.
*/
smp_rmb();

write_offset = offset;
(or write_offset = local_read(&offset))

if (consumed_old and commit_count shows subbuffer not full)
return -EAGAIN;

Allow reading subbuffer.
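
The smp_rmb() pairs with the writer's smp_wmb(): the writer orders
"store data, then bump commit_count", the reader orders "load
commit_count, then load data", so a commit count covering the subbuffer
guarantees the data reads see completed writes. A sketch of the
reader-side check (SUBBUF_ORDER, commit_count and subbuf_full() are
illustrative names, not LTTng's):

#include <linux/errno.h>
#include <asm/local.h>
#include <asm/system.h>

#define SUBBUF_ORDER    12      /* illustrative: 4 kB subbuffers */

extern local_t commit_count[];  /* one commit counter per subbuffer */
extern int subbuf_full(unsigned long commit, unsigned long consumed);

static int subbuf_ready(unsigned long consumed_old)
{
        unsigned long idx = consumed_old >> SUBBUF_ORDER;
        unsigned long commit = local_read(&commit_count[idx]);

        smp_rmb();      /* pairs with the writer's smp_wmb() */

        if (!subbuf_full(commit, consumed_old))
                return -EAGAIN;
        return 0;       /* data loads after this point see completed writes */
}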


Mathieu


--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-17 16:06:56

by Mathieu Desnoyers

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* Paul E. McKenney ([email protected]) wrote:
> On Mon, Mar 16, 2009 at 09:32:20PM -0400, Mathieu Desnoyers wrote:
> > Hi,
> >
> > I am trying to get access to some non-x86 hardware to run some atomic
> > primitive benchmarks for a paper on LTTng I am preparing. That should be
> > useful to argue about performance benefit of per-cpu atomic operations
> > vs interrupt disabling. I would like to run the following benchmark
> > module on CONFIG_SMP :
> >
> > - PowerPC
> > - MIPS
> > - ia64
> > - alpha
> >
> > usage :
> > make
> > insmod test-cmpxchg-nolock.ko
> > insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily unavailable
> > dmesg (see dmesg output)
>
> Here you are on a 4.2GHz Power box:
>
> test init
> test results: time for baseline
> number of loops: 20000
> total time: 12490
> -> baseline takes 0 cycles
> test end
> test results: time for locked cmpxchg
> number of loops: 20000
> total time: 345748
> -> locked cmpxchg takes 17 cycles
> test end
> test results: time for non locked cmpxchg
> number of loops: 20000
> total time: 198304
> -> non locked cmpxchg takes 9 cycles
> test end
> test results: time for locked add return
> number of loops: 20000
> total time: 253977
> -> locked add return takes 12 cycles
> test end
> test results: time for non locked add return
> number of loops: 20000
> total time: 189837
> -> non locked add return takes 9 cycles
> test end
> test results: time for enabling interrupts (STI)
> number of loops: 20000
> total time: 298390
> -> enabling interrupts (STI) takes 14 cycles
> test end
> test results: time for disabling interrupts (CLI)
> number of loops: 20000
> total time: 43977
> -> disabling interrupts (CLI) takes 2 cycles
> test end
> test results: time for disabling/enabling interrupts (STI/CLI)
> number of loops: 20000
> total time: 298773
> -> enabling/disabling interrupts (STI/CLI) takes 14 cycles
> test end

Thanks!

So on powerpc64, we have:

local_cmpxchg + local_add_return: 9 + 9 = 18 cycles
irq off/on: ~14-16 cycles (2 for CLI plus 14 for STI measured
separately, and 14 for the STI/CLI pair; this is without the write and
increment instructions performing the same work as the cmpxchg and
add_return. The imprecision of the measurement is probably due to
pipeline effects.)

But powerpc has non-maskable interrupts, so for a difference of less
than 4 cycles, I think it's better to stay with the local_t variant to
be NMI-safe.

Mathieu


>
> Thanx, Paul
>
>
> > [original request, benchmark module source and Makefile snipped]

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-17 18:42:38

by Alan D. Brunelle

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

Here are the results for:

processor : 31
vendor : GenuineIntel
arch : IA-64
family : 32
model : 0
model name : Dual-Core Intel(R) Itanium(R) 2 Processor 9050
revision : 7
archrev : 0
features : branchlong, 16-byte atomic ops
cpu number : 0
cpu regs : 4
cpu MHz : 1598.002
itc MHz : 400.000000
BogoMIPS : 3186.68
siblings : 2
physical id: 196865
core id : 1
thread id : 0

test init
test results: time for baseline
number of loops: 20000
total time: 5002
-> baseline takes 0 cycles
test end
test results: time for locked cmpxchg
number of loops: 20000
total time: 60083
-> locked cmpxchg takes 3 cycles
test end
test results: time for non locked cmpxchg
number of loops: 20000
total time: 60002
-> non locked cmpxchg takes 3 cycles
test end
test results: time for locked add return
number of loops: 20000
total time: 155007
-> locked add return takes 7 cycles
test end
test results: time for non locked add return
number of loops: 20000
total time: 155004
-> non locked add return takes 7 cycles
test end
test results: time for enabling interrupts (STI)
number of loops: 20000
total time: 45003
-> enabling interrupts (STI) takes 2 cycles
test end
test results: time for disabling interrupts (CLI)
number of loops: 20000
total time: 59998
-> disabling interrupts (CLI) takes 2 cycles
test end
test results: time for disabling/enabling interrupts (STI/CLI)
number of loops: 20000
total time: 107274
-> enabling/disabling interrupts (STI/CLI) takes 5 cycles
test end

2009-03-17 19:02:06

by Andika Triwidada

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

Older IA-64

processor : 3
vendor : GenuineIntel
arch : IA-64
family : 32
model : 0
model name : Intel(r) Itanium(r) 2 Processor 1.6GHz with 18M L3 Cache
for 533MHz Platforms
revision : 7
archrev : 0
features : branchlong, 16-byte atomic ops
cpu number : 0
cpu regs : 4
cpu MHz : 1594.911
itc MHz : 399.226174
BogoMIPS : 3186.68
siblings : 2
physical id: 1
core id : 1
thread id : 0

test init
test results: time for baseline
number of loops: 20000
total time: 10003
-> baseline takes 0 cycles
test end
test results: time for locked cmpxchg
number of loops: 20000
total time: 60073
-> locked cmpxchg takes 3 cycles
test end
test results: time for non locked cmpxchg
number of loops: 20000
total time: 60014
-> non locked cmpxchg takes 3 cycles
test end
test results: time for locked add return
number of loops: 20000
total time: 150007
-> locked add return takes 7 cycles
test end
test results: time for non locked add return
number of loops: 20000
total time: 150009
-> non locked add return takes 7 cycles
test end
test results: time for enabling interrupts (STI)
number of loops: 20000
total time: 45003
-> enabling interrupts (STI) takes 2 cycles
test end
test results: time for disabling interrupts (CLI)
number of loops: 20000
total time: 59999
-> disabling interrupts (CLI) takes 2 cycles
test end
test results: time for disabling/enabling interrupts (STI/CLI)
number of loops: 20000
total time: 105001
-> enabling/disabling interrupts (STI/CLI) takes 5 cycles
test end

2009-03-17 19:28:39

by David Miller

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

From: Mathieu Desnoyers <[email protected]>
Date: Tue, 17 Mar 2009 12:06:35 -0400

> But powerpc has non-maskable interrupts, so for less than 4 cycles, I
> think it's better to stay with the local_t variant to be NMI-safe.

Sparc64 has non-maskable interrupts too btw.

2009-03-17 19:35:49

by Mathieu Desnoyers

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* David Miller ([email protected]) wrote:
> From: Mathieu Desnoyers <[email protected]>
> Date: Tue, 17 Mar 2009 12:06:35 -0400
>
> > But powerpc has non-maskable interrupts, so for less than 4 cycles, I
> > think it's better to stay with the local_t variant to be NMI-safe.
>
> Sparc64 has non-maskable interrupts too btw.
>

OK, this alone is a strong argument in favor of the NMI-safe lockless
algorithm, even if it is a bit slower. We should measure what impact it
has on overall tracing performance to find out whether it makes a
measurable difference anyway.

Mathieu


--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-18 11:43:55

by Nick Piggin

Subject: Re: [ltt-dev] cli/sti vs local_cmpxchg and local_add_return

On Wednesday 18 March 2009 02:14:37 Mathieu Desnoyers wrote:
> * Nick Piggin ([email protected]) wrote:
> > On Tuesday 17 March 2009 12:32:20 Mathieu Desnoyers wrote:
> > > Hi,
> > >
> > > I am trying to get access to some non-x86 hardware to run some atomic
> > > primitive benchmarks for a paper on LTTng I am preparing. That should
> > > be useful to argue about performance benefit of per-cpu atomic
> > > operations vs interrupt disabling. I would like to run the following
> > > benchmark module on CONFIG_SMP :
> > >
> > > - PowerPC
> > > - MIPS
> > > - ia64
> > > - alpha
> > >
> > > usage :
> > > make
> > > insmod test-cmpxchg-nolock.ko
> > > insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource
> > > temporarily unavailable dmesg (see dmesg output)
> > >
> > > If some of you would be kind enough to run my test module provided
> > > below and provide the results of these tests on a recent kernel
> > > (2.6.26~2.6.29 should be good) along with their cpuinfo, I would
> > > greatly appreciate.
> > >
> > > Here are the CAS results for various Intel-based architectures :
> > >
> > > Architecture        | Speedup                     |     CAS      |        Interrupts
> > >                     | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> > > -----------------------------------------------------------------------------------------------
> > > Intel Pentium 4     | 5.24                        |  25   |  81  |      70      |      61
> > > AMD Athlon(tm)64 X2 | 4.57                        |   7   |  17  |      17      |      15
> > > Intel Core2         | 6.33                        |   6   |  30  |      20      |      18
> > > Intel Xeon E5405    | 5.25                        |   8   |  24  |      20      |      22
> > >
> > > The benefit expected on PowerPC, ia64 and alpha should principally come
> > > from removed memory barriers in the local primitives.
> >
> > Benefit versus what? I think all of those architectures can do SMP
> > atomic compare exchange sequences without barriers, can't they?
>
> Hi Nick,
>
> I want to compare if it is faster to use SMP cas without barriers to
> perform synchronization of the tracing hot path wrt interrupts or if it
> is faster to disable interrupts. These decisions will depend on the
> benchmark I propose, because it is comparing the time it takes to
> perform both.
>
> Overall, the benchmarks will allow to choose between those two
> simplified hotpath pseudo-codes (offset is global to the buffer,
> commit_count is per-subbuffer).
>
>
> * lockless :
>
> do {
> old_offset = local_read(&offset);
> get_cycles();
> compute needed size.
> new_offset = old_offset + size;
> } while (local_cmpxchg(&offset, old_offset, new_offset) != old_offset);
>
> /*
> * note : writing to buffer is done out-of-order wrt buffer slot
> * physical order.
> */
> write_to_buffer(offset);
>
> /*
> * Make sure the data is written in the buffer before commit count is
> * incremented.
> */
> smp_wmb();
>
> /* note : incrementing the commit count is also done out-of-order */
> count = local_add_return(size, &commit_count[subbuf_index]);
> if (count is filling a subbuffer)
> allow to wake up readers

Ah OK, so you just mean the benefit of using local atomics is avoiding
the barriers that you get with atomic_t.

I'd thought you were referring to some benefit over the irq disable
pattern.
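
In other words, the two calls below do the same arithmetic and differ
only in their guarantees (a sketch; the exact barriers emitted are
architecture-specific):

#include <asm/atomic.h>
#include <asm/local.h>

static atomic_t acnt = ATOMIC_INIT(0);
static local_t lcnt = LOCAL_INIT(0);

static void add_both(int size)
{
        /*
         * SMP-safe: other CPUs may concurrently update acnt, and on
         * PowerPC/ia64/alpha this implies memory-barrier overhead.
         */
        atomic_add_return(size, &acnt);

        /*
         * Only this CPU (including its interrupt handlers) updates
         * lcnt, so the barriers and cross-CPU protection can go.
         */
        local_add_return(size, &lcnt);
}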


> * irq off :
>
> (note : offset and commit count would each be written to atomically
> (type unsigned long))
>
> local_irq_save(flags);
>
> get_cycles();
> compute needed size;
> offset += size;
>
> write_to_buffer(offset);
>
> /*
> * Make sure the data is written in the buffer before commit count is
> * incremented.
> */
> smp_wmb();
>
> commit_count[subbuf_index] += size;
> if (count is filling a subbuffer)
> allow to wake up readers
>
> local_irq_restore(flags);

2009-03-18 11:57:11

by Josh Boyer

Subject: Re: cli/sti vs local_cmpxchg and local_add_return

On Mon, Mar 16, 2009 at 09:32:20PM -0400, Mathieu Desnoyers wrote:
>Hi,
>
>I am trying to get access to some non-x86 hardware to run some atomic
>primitive benchmarks for a paper on LTTng I am preparing. That should be
>useful to argue about performance benefit of per-cpu atomic operations
>vs interrupt disabling. I would like to run the following benchmark
>module on CONFIG_SMP :
>
>- PowerPC
>- MIPS
>- ia64
>- alpha
>
> [usage instructions and x86 results table snipped]


I know you have results from a POWER6 machine already, but
here are the results on a dual-G5 running 2.6.29-rc7-git4.

If you are interested, I could get you results from running
this on an embedded PowerPC board.

josh

test init
test results: time for baseline
number of loops: 20000
total time: 1532
-> baseline takes 0 cycles
test end
test results: time for locked cmpxchg
number of loops: 20000
total time: 48052
-> locked cmpxchg takes 2 cycles
test end
test results: time for non locked cmpxchg
number of loops: 20000
total time: 29141
-> non locked cmpxchg takes 1 cycles
test end
test results: time for locked add return
number of loops: 20000
total time: 44985
-> locked add return takes 2 cycles
test end
test results: time for non locked add return
number of loops: 20000
total time: 32400
-> non locked add return takes 1 cycles
test end
test results: time for enabling interrupts (STI)
number of loops: 20000
total time: 65579
-> enabling interrupts (STI) takes 3 cycles
test end
test results: time for disabling interrupts (CLI)
number of loops: 20000
total time: 29135
-> disabling interrupts (CLI) takes 1 cycles
test end
test results: time for disabling/enabling interrupts (STI/CLI)
number of loops: 20000
total time: 173594
-> enabling/disabling interrupts (STI/CLI) takes 8 cycles
test end

2009-03-18 15:10:44

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [ltt-dev] cli/sti vs local_cmpxchg and local_add_return

* Nick Piggin ([email protected]) wrote:
> On Wednesday 18 March 2009 02:14:37 Mathieu Desnoyers wrote:
> > * Nick Piggin ([email protected]) wrote:
> > > On Tuesday 17 March 2009 12:32:20 Mathieu Desnoyers wrote:
> > > > Hi,
> > > >
> > > > I am trying to get access to some non-x86 hardware to run some atomic
> > > > primitive benchmarks for a paper on LTTng I am preparing. That should
> > > > be useful to argue about performance benefit of per-cpu atomic
> > > > operations vs interrupt disabling. I would like to run the following
> > > > benchmark module on CONFIG_SMP :
> > > >
> > > > - PowerPC
> > > > - MIPS
> > > > - ia64
> > > > - alpha
> > > >
> > > > usage :
> > > > make
> > > > insmod test-cmpxchg-nolock.ko
> > > > insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource
> > > > temporarily unavailable dmesg (see dmesg output)
> > > >
> > > > If some of you would be kind enough to run my test module provided
> > > > below and provide the results of these tests on a recent kernel
> > > > (2.6.26~2.6.29 should be good) along with their cpuinfo, I would
> > > > greatly appreciate.
> > > >
> > > > Here are the CAS results for various Intel-based architectures :
> > > >
> > > > Architecture        | Speedup                     | CAS         | Interrupts                  |
> > > >                     | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> > > > -------------------------------------------------------------------------------------------------
> > > > Intel Pentium 4     | 5.24                        | 25    | 81   | 70           | 61           |
> > > > AMD Athlon(tm)64 X2 | 4.57                        | 7     | 17   | 17           | 15           |
> > > > Intel Core2         | 6.33                        | 6     | 30   | 20           | 18           |
> > > > Intel Xeon E5405    | 5.25                        | 8     | 24   | 20           | 22           |
> > > >
> > > > The benefit expected on PowerPC, ia64 and alpha should principally come
> > > > from removed memory barriers in the local primitives.
> > >
> > > Benefit versus what? I think all of those architectures can do SMP
> > > atomic compare exchange sequences without barriers, can't they?
> >
> > Hi Nick,
> >
> > I want to compare whether it is faster to synchronize the tracing hot
> > path wrt interrupts using an SMP CAS without barriers, or to simply
> > disable interrupts. That decision will be based on the benchmark I
> > propose, since it measures the time each approach takes.
> >
> > Overall, the benchmarks will allow choosing between these two
> > simplified hot-path pseudo-code variants (offset is global to the
> > buffer, commit_count is per-subbuffer).
> >
> >
> > * lockless :
> >
> > do {
> > old_offset = local_read(&offset);
> > get_cycles();
> > compute needed size.
> > new_offset = old_offset + size;
> > } while (local_cmpxchg(&offset, old_offset, new_offset) != old_offset);
> >
> > /*
> > * note : writing to buffer is done out-of-order wrt buffer slot
> > * physical order.
> > */
> > write_to_buffer(offset);
> >
> > /*
> > * Make sure the data is written in the buffer before commit count is
> > * incremented.
> > */
> > smp_wmb();
> >
> > /* note : incrementing the commit count is also done out-of-order */
> > count = local_add_return(size, &commit_count[subbuf_index]);
> > if (count is filling a subbuffer)
> > allow to wake up readers
>
> Ah OK, so you just mean that the benefit of using local atomics is
> avoiding the barriers that atomic_t requires.
>
> I'd thought you were referring to some benefit over the irq disable pattern.
>

On powerpc and mips, for instance, yes, the gain is just the removed
barriers. On x86 it becomes more interesting, because we can also drop
the lock prefix, which gives a good speedup. All I want to do here is
figure out which of barrier-less local_t ops and interrupt disabling is
faster (and by how much) on the various architectures.
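
To be concrete about the x86 case, the two CAS flavours boil down to
something like this (a hand-written x86-64 sketch, not the kernel's
actual cmpxchg implementation):

/*
 * Sketch only -- not the kernel's actual implementation. The SMP
 * variant needs the lock prefix to be atomic across CPUs; the
 * CPU-local variant can omit it, because a single instruction cannot
 * be interrupted in the middle on its own CPU.
 */
static inline unsigned long cas_smp(volatile unsigned long *p,
                                    unsigned long old, unsigned long new)
{
        asm volatile("lock; cmpxchg %2, %1"
                     : "+a" (old), "+m" (*p)
                     : "r" (new)
                     : "memory");
        return old;     /* previous memory value */
}

static inline unsigned long cas_local(volatile unsigned long *p,
                                      unsigned long old, unsigned long new)
{
        asm volatile("cmpxchg %2, %1"   /* same instruction, no lock prefix */
                     : "+a" (old), "+m" (*p)
                     : "r" (new)
                     : "memory");
        return old;
}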

For instance, on an architecture like powerpc64 (tests provided by Paul
McKenney), the difference between irq off/on (14-16 cycles, and this is
without doing the data access) and doing both local_cmpxchg and
local_add_return (18 cycles) is less than 4 cycles. So given that we may
have tracepoints called from NMI context, the tiny performance cost of
the local_t ops does not outweigh the benefit of a lockless, NMI-safe
trace buffer management algorithm.
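
For reference, the lockless hot path above maps onto the local_t API
roughly as follows. This is only a sketch: the ring_buf layout and the
helpers (header_size, write_to_buffer, subbuf_index, subbuf_full,
wake_up_readers) are placeholders rather than LTTng's actual code, and
it assumes a per-CPU buffer with preemption already disabled.

#include <linux/kernel.h>
#include <asm/local.h>
#include <asm/timex.h>

#define NR_SUBBUFS 8                    /* placeholder value */

struct ring_buf {
        local_t offset;                 /* global write offset */
        local_t commit_count[NR_SUBBUFS];
        /* backing storage omitted */
};

/* Placeholder helpers, assumed provided by the buffer implementation. */
extern size_t header_size(cycles_t tsc);
extern void write_to_buffer(struct ring_buf *buf, long offset,
                            const void *data, size_t size);
extern unsigned int subbuf_index(struct ring_buf *buf, long offset);
extern int subbuf_full(struct ring_buf *buf, long count);
extern void wake_up_readers(struct ring_buf *buf);

static void trace_event(struct ring_buf *buf, const void *data,
                        size_t payload)
{
        long old_offset, new_offset, count;
        size_t size;

        /*
         * Reserve space; retried if an interrupt or NMI on this CPU
         * won the race for the slot.
         */
        do {
                old_offset = local_read(&buf->offset);
                size = header_size(get_cycles()) + payload;
                new_offset = old_offset + size;
        } while (local_cmpxchg(&buf->offset, old_offset, new_offset)
                 != old_offset);

        /* Slots may be written out of order wrt their physical order. */
        write_to_buffer(buf, old_offset, data, size);

        /* Order the payload writes before the commit count update. */
        smp_wmb();

        count = local_add_return(size,
                        &buf->commit_count[subbuf_index(buf, old_offset)]);
        if (subbuf_full(buf, count))
                wake_up_readers(buf);
}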

Thanks,

Mathieu

>
> > * irq off :
> >
> > (note : offset and commit count would each be written to atomically
> > (type unsigned long))
> >
> > local_irq_save(flags);
> >
> > get_cycles();
> > compute needed size;
> > offset += size;
> >
> > write_to_buffer(offset);
> >
> > /*
> > * Make sure the data is written in the buffer before commit count is
> > * incremented.
> > */
> > smp_wmb();
> >
> > commit_count[subbuf_index] += size;
> > if (commit_count[subbuf_index] is filling a subbuffer)
> > allow to wake up readers
> >
> > local_irq_restore(flags);
>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-23 16:50:28

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* Alan D. Brunelle ([email protected]) wrote:
> Here are the results for:
>
> processor : 31
> vendor : GenuineIntel
> arch : IA-64
> family : 32
> model : 0
> model name : Dual-Core Intel(R) Itanium(R) 2 Processor 9050
> revision : 7
> archrev : 0
> features : branchlong, 16-byte atomic ops
> cpu number : 0
> cpu regs : 4
> cpu MHz : 1598.002
> itc MHz : 400.000000
> BogoMIPS : 3186.68
> siblings : 2
> physical id: 196865
> core id : 1
> thread id : 0
>
> test init
> test results: time for baseline
> number of loops: 20000
> total time: 5002
> -> baseline takes 0 cycles
> test end
> test results: time for locked cmpxchg
> number of loops: 20000
> total time: 60083
> -> locked cmpxchg takes 3 cycles
> test end
> test results: time for non locked cmpxchg
> number of loops: 20000
> total time: 60002
> -> non locked cmpxchg takes 3 cycles
> test end
> test results: time for locked add return
> number of loops: 20000
> total time: 155007
> -> locked add return takes 7 cycles
> test end
> test results: time for non locked add return
> number of loops: 20000
> total time: 155004
> -> non locked add return takes 7 cycles
> test end
> test results: time for enabling interrupts (STI)
> number of loops: 20000
> total time: 45003
> -> enabling interrupts (STI) takes 2 cycles
> test end
> test results: time for disabling interrupts (CLI)
> number of loops: 20000
> total time: 59998
> -> disabling interrupts (CLI) takes 2 cycles
> test end
> test results: time for disabling/enabling interrupts (STI/CLI)
> number of loops: 20000
> total time: 107274
> -> enabling/disabling interrupts (STI/CLI) takes 5 cycles
> test end

Hi Alan,

Wow, disabling interrupts is incredibly cheap on ia64, and
local_add_return is especially costly. I think that is because it is
implemented with an underlying cmpxchg loop rather than supported
directly by the architecture (except for fetchadd, which is limited to
a small set of immediate values).
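
By "underlying cmpxchg" I mean something like the following loop (a
sketch, not the actual ia64 asm/local.h code):

#include <asm/local.h>

/*
 * Sketch: emulating local_add_return() with a cmpxchg loop, as an
 * architecture without a general-purpose atomic add-return must do.
 * Illustrative only -- not the actual ia64 implementation.
 */
static inline long local_add_return_emulated(long i, local_t *l)
{
        long old, new;

        do {
                old = local_read(l);
                new = old + i;
        } while (local_cmpxchg(l, old, new) != old);

        return new;
}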

Given that some ia64 code refers to NMIs, I guess this architecture
supports them. So in the end, the choice comes down to a robustness vs.
speed tradeoff. But given the time it takes to write the data to memory,
I think 5 cycles vs. 10 cycles won't make a big difference overall.

Thanks for those results !

Mathieu

>

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-23 16:56:47

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: cli/sti vs local_cmpxchg and local_add_return

* Josh Boyer ([email protected]) wrote:
> On Mon, Mar 16, 2009 at 09:32:20PM -0400, Mathieu Desnoyers wrote:
> >Hi,
> >
> >I am trying to get access to some non-x86 hardware to run some atomic
> >primitive benchmarks for a paper on LTTng I am preparing. That should be
> >useful to argue about performance benefit of per-cpu atomic operations
> >vs interrupt disabling. I would like to run the following benchmark
> >module on CONFIG_SMP :
> >
> >- PowerPC
> >- MIPS
> >- ia64
> >- alpha
> >
> >usage :
> >make
> >insmod test-cmpxchg-nolock.ko
> >insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily unavailable
> >dmesg (see dmesg output)
> >
> >If some of you would be kind enough to run my test module provided below
> >and provide the results of these tests on a recent kernel (2.6.26~2.6.29
> >should be good) along with their cpuinfo, I would greatly appreciate.
> >
> >Here are the CAS results for various Intel-based architectures :
> >
> >Architecture | Speedup | CAS | Interrupts |
> > | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
> >-------------------------------------------------------------------------------------------------
> >Intel Pentium 4 | 5.24 | 25 | 81 | 70 | 61 |
> >AMD Athlon(tm)64 X2 | 4.57 | 7 | 17 | 17 | 15 |
> >Intel Core2 | 6.33 | 6 | 30 | 20 | 18 |
> >Intel Xeon E5405 | 5.25 | 8 | 24 | 20 | 22 |
>
>
> I know you have results from a POWER6 machine already, but
> here are the results on a dual-G5 running 2.6.29-rc7-git4.
>
> If you are interested, I could get you results from running
> this on an embedded PowerPC board.
>

Thanks for the results. Well, those already show that the tradeoff
differs between POWER6 and POWER5, so I guess further powerpc numbers
won't be required.

Mathieu

> josh
>
> test init
> test results: time for baseline
> number of loops: 20000
> total time: 1532
> -> baseline takes 0 cycles
> test end
> test results: time for locked cmpxchg
> number of loops: 20000
> total time: 48052
> -> locked cmpxchg takes 2 cycles
> test end
> test results: time for non locked cmpxchg
> number of loops: 20000
> total time: 29141
> -> non locked cmpxchg takes 1 cycles
> test end
> test results: time for locked add return
> number of loops: 20000
> total time: 44985
> -> locked add return takes 2 cycles
> test end
> test results: time for non locked add return
> number of loops: 20000
> total time: 32400
> -> non locked add return takes 1 cycles
> test end
> test results: time for enabling interrupts (STI)
> number of loops: 20000
> total time: 65579
> -> enabling interrupts (STI) takes 3 cycles
> test end
> test results: time for disabling interrupts (CLI)
> number of loops: 20000
> total time: 29135
> -> disabling interrupts (CLI) takes 1 cycles
> test end
> test results: time for disabling/enabling interrupts (STI/CLI)
> number of loops: 20000
> total time: 173594
> -> enabling/disabling interrupts (STI/CLI) takes 8 cycles
> test end

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

2009-03-23 17:05:20

by Josh Boyer

[permalink] [raw]
Subject: Re: cli/sti vs local_cmpxchg and local_add_return

On Mon, Mar 23, 2009 at 12:56:32PM -0400, Mathieu Desnoyers wrote:
>* Josh Boyer ([email protected]) wrote:
>> On Mon, Mar 16, 2009 at 09:32:20PM -0400, Mathieu Desnoyers wrote:
>> >Hi,
>> >
>> >I am trying to get access to some non-x86 hardware to run some atomic
>> >primitive benchmarks for a paper on LTTng I am preparing. That should be
>> >useful to argue about performance benefit of per-cpu atomic operations
>> >vs interrupt disabling. I would like to run the following benchmark
>> >module on CONFIG_SMP :
>> >
>> >- PowerPC
>> >- MIPS
>> >- ia64
>> >- alpha
>> >
>> >usage :
>> >make
>> >insmod test-cmpxchg-nolock.ko
>> >insmod: error inserting 'test-cmpxchg-nolock.ko': -1 Resource temporarily unavailable
>> >dmesg (see dmesg output)
>> >
>> >If some of you would be kind enough to run my test module provided below
>> >and provide the results of these tests on a recent kernel (2.6.26~2.6.29
>> >should be good) along with their cpuinfo, I would greatly appreciate.
>> >
>> >Here are the CAS results for various Intel-based architectures :
>> >
>> >Architecture | Speedup | CAS | Interrupts |
>> > | (cli + sti) / local cmpxchg | local | sync | Enable (sti) | Disable (cli)
>> >-------------------------------------------------------------------------------------------------
>> >Intel Pentium 4 | 5.24 | 25 | 81 | 70 | 61 |
>> >AMD Athlon(tm)64 X2 | 4.57 | 7 | 17 | 17 | 15 |
>> >Intel Core2 | 6.33 | 6 | 30 | 20 | 18 |
>> >Intel Xeon E5405 | 5.25 | 8 | 24 | 20 | 22 |
>>
>>
>> I know you have results from a POWER6 machine already, but
>> here are the results on a dual-G5 running 2.6.29-rc7-git4.
>>
>> If you are interested, I could get you results from running
>> this on an embedded PowerPC board.
>>
>
>Thanks for the results. Well, those already shows that the tradeoff is
>different between POWER6 and POWER5, so I guess further powerpc numbers
>won't be required.

Correction: a dual-G5 is a PowerPC 970 machine. It's closer to POWER4
than POWER5, and nothing like POWER6. The Apple G5 machines are about
two generations old in terms of 64-bit PowerPC CPUs.

josh