2009-11-13 11:54:42

by Jan Beulich

[permalink] [raw]
Subject: [PATCH] x86: eliminate redundant/contradicting cache line size config options

Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with
inconsistent defaults), just having the latter suffices as the former
can be easily calculated from it.

To be consistent, also change X86_INTERNODE_CACHE_BYTES to
X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to
account for last level cache line size (which here matters more than
L1 cache line size).

Finally, make sure the default value for X86_L1_CACHE_SHIFT, when
X86_GENERIC is selected, is being seen before that for the individual
CPU model options (other than on x86-64, where GENERIC_CPU is part of
the choice construct, X86_GENERIC is a separate option on ix86).

Signed-off-by: Jan Beulich <[email protected]>
Cc: Nick Piggin <[email protected]>

---
arch/x86/Kconfig.cpu | 14 +++++---------
arch/x86/boot/compressed/vmlinux.lds.S | 3 ++-
arch/x86/include/asm/cache.h | 7 ++++---
arch/x86/kernel/vmlinux.lds.S | 10 +++++-----
arch/x86/mm/tlb.c | 3 ++-
5 files changed, 18 insertions(+), 19 deletions(-)

--- linux-2.6.32-rc7/arch/x86/Kconfig.cpu 2009-11-13 12:38:01.000000000 +0100
+++ 2.6.32-rc7-x86-cache-config/arch/x86/Kconfig.cpu 2009-11-09 15:55:41.000000000 +0100
@@ -301,15 +301,11 @@ config X86_CPU

#
# Define implied options from the CPU selection here
-config X86_L1_CACHE_BYTES
+config X86_INTERNODE_CACHE_SHIFT
int
- default "128" if MPSC
- default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
-
-config X86_INTERNODE_CACHE_BYTES
- int
- default "4096" if X86_VSMP
- default X86_L1_CACHE_BYTES if !X86_VSMP
+ default "12" if X86_VSMP
+ default "7" if NUMA
+ default X86_L1_CACHE_SHIFT

config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU

config X86_XADD
def_bool y
--- linux-2.6.32-rc7/arch/x86/boot/compressed/vmlinux.lds.S 2009-11-13 12:38:01.000000000 +0100
+++ 2.6.32-rc7-x86-cache-config/arch/x86/boot/compressed/vmlinux.lds.S 2009-11-09 15:52:47.000000000 +0100
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONF

#undef i386

+#include <asm/cache.h>
#include <asm/page_types.h>

#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
*(.data.*)
_edata = . ;
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
--- linux-2.6.32-rc7/arch/x86/include/asm/cache.h 2009-11-13 12:38:01.000000000 +0100
+++ 2.6.32-rc7-x86-cache-config/arch/x86/include/asm/cache.h 2009-11-09 15:50:22.000000000 +0100
@@ -9,12 +9,13 @@

#define __read_mostly __attribute__((__section__(".data.read_mostly")))

+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
+
#ifdef CONFIG_X86_VSMP
-/* vSMP Internode cacheline shift */
-#define INTERNODE_CACHE_SHIFT (12)
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
- __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+ __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
__page_aligned_data
#endif
#endif
--- linux-2.6.32-rc7/arch/x86/kernel/vmlinux.lds.S 2009-11-13 12:38:02.000000000 +0100
+++ 2.6.32-rc7-x86-cache-config/arch/x86/kernel/vmlinux.lds.S 2009-11-09 15:52:23.000000000 +0100
@@ -107,13 +107,13 @@ SECTIONS

PAGE_ALIGNED_DATA(PAGE_SIZE)

- CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)

DATA_DATA
CONSTRUCTORS

/* rarely changed data like cpu maps */
- READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)

/* End of data section */
_edata = .;
@@ -137,12 +137,12 @@ SECTIONS
*(.vsyscall_0)
} :user

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
*(.vsyscall_fn)
}

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
*(.vsyscall_gtod_data)
}
@@ -166,7 +166,7 @@ SECTIONS
}
vgetcpu_mode = VVIRT(.vgetcpu_mode);

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) {
*(.jiffies)
}
--- linux-2.6.32-rc7/arch/x86/mm/tlb.c 2009-11-13 12:38:02.000000000 +0100
+++ 2.6.32-rc7-x86-cache-config/arch/x86/mm/tlb.c 2009-11-09 15:57:23.000000000 +0100
@@ -8,6 +8,7 @@

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

@@ -43,7 +44,7 @@ union smp_flush_state {
spinlock_t tlbstate_lock;
DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
- char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+ char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;

/* State is put into the per CPU data section, but padded


2009-11-16 04:14:05

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Fri, Nov 13, 2009 at 11:54:40AM +0000, Jan Beulich wrote:
> Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with
> inconsistent defaults), just having the latter suffices as the former
> can be easily calculated from it.
>
> To be consistent, also change X86_INTERNODE_CACHE_BYTES to
> X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to
> account for last level cache line size (which here matters more than
> L1 cache line size).

I think if we're going to set it to 7 (128B, for Pentium 4), then
we should set the L1 cache shift as well? Most alignments to
prevent cacheline pingpong use L1 cache shift for this anyway?

The internode thing is really just a not quite well defined thing
because internode cachelines are really expensive and really big
on vsmp so they warrant trading off extra space on some critical
structures to reduce pingpong (but this is not to say that other
structures that are *not* internode annotated do *not* need to
worry about pingpong).

Put another way, big SMP P4 systems with !NUMA still want to have
128 byte aligned variables. I think it should just default to
L1_CACHE_SHIFT unless the special case of vsmp.

But other than that complaint (which was not introduced by your
patch anyway) then this looks like a good cleanup to me, thanks.


> Finally, make sure the default value for X86_L1_CACHE_SHIFT, when
> X86_GENERIC is selected, is being seen before that for the individual
> CPU model options (other than on x86-64, where GENERIC_CPU is part of
> the choice construct, X86_GENERIC is a separate option on ix86).
>
> Signed-off-by: Jan Beulich <[email protected]>
> Cc: Nick Piggin <[email protected]>

2009-11-16 08:08:07

by Jan Beulich

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

>>> Nick Piggin <[email protected]> 16.11.09 05:14 >>>
>On Fri, Nov 13, 2009 at 11:54:40AM +0000, Jan Beulich wrote:
>> Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with
>> inconsistent defaults), just having the latter suffices as the former
>> can be easily calculated from it.
>>
>> To be consistent, also change X86_INTERNODE_CACHE_BYTES to
>> X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to
>> account for last level cache line size (which here matters more than
>> L1 cache line size).
>
>I think if we're going to set it to 7 (128B, for Pentium 4), then
>we should set the L1 cache shift as well? Most alignments to
>prevent cacheline pingpong use L1 cache shift for this anyway?

But for P4 L1_CACHE_SHIFT already is 7.

>The internode thing is really just a not quite well defined thing
>because internode cachelines are really expensive and really big
>on vsmp so they warrant trading off extra space on some critical
>structures to reduce pingpong (but this is not to say that other
>structures that are *not* internode annotated do *not* need to
>worry about pingpong).

The internode one, as said in the patch description, should consider
the last level cache line size rather than L1, which 128 seems to be
a much better fit (without introducing model dependencies like
for L1) than just using the L1 value directly.

Jan

2009-11-16 10:56:54

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Mon, Nov 16, 2009 at 08:08:07AM +0000, Jan Beulich wrote:
> >>> Nick Piggin <[email protected]> 16.11.09 05:14 >>>
> >On Fri, Nov 13, 2009 at 11:54:40AM +0000, Jan Beulich wrote:
> >> Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT (with
> >> inconsistent defaults), just having the latter suffices as the former
> >> can be easily calculated from it.
> >>
> >> To be consistent, also change X86_INTERNODE_CACHE_BYTES to
> >> X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA to
> >> account for last level cache line size (which here matters more than
> >> L1 cache line size).
> >
> >I think if we're going to set it to 7 (128B, for Pentium 4), then
> >we should set the L1 cache shift as well? Most alignments to
> >prevent cacheline pingpong use L1 cache shift for this anyway?
>
> But for P4 L1_CACHE_SHIFT already is 7.

I was more talking about the GENERIC defaults, which is now 64.
There is no point in making it 128B on a system with 64B cachelines.
So it should be the same as L1_CACHE_BYTES, and I guess we don't
need to further debate the 64B size for GENERIC kernels now because
0a2a18b721abc960fbcada406746877d22340a60 already decided it should
be 64.


> >The internode thing is really just a not quite well defined thing
> >because internode cachelines are really expensive and really big
> >on vsmp so they warrant trading off extra space on some critical
> >structures to reduce pingpong (but this is not to say that other
> >structures that are *not* internode annotated do *not* need to
> >worry about pingpong).
>
> The internode one, as said in the patch description, should consider
> the last level cache line size rather than L1, which 128 seems to be
> a much better fit (without in introducing model dependencies like
> for L1) than just using the L1 value directly.

By far the biggest user has always been to avoid cacheline ping
pong, so LLC size is more important there and IMO the L1 cache
size macro should always work best with the largest line size in
the hierarchy.

Internode was introduced because making it 4096 everywhere was
probably too prohibitive, so only the worst ones were getting
converted (this situation is probably far better now with much
better per-cpu structures and more dynamic allocation (ie. no
more [NR_CPUS] arrays of cacheline_aligned_in_smp to speak of).

The only other use for L1 cache size macro is to pack objects
to cachelines better (so they always use the fewest number of
lines). But this case is more rare nowadays; people don't really
count cachelines anymore, but I think even then it makes sense
for it to be the largest line size in the system because we
don't know how big L1s are, and if you want optimal L1 packing,
you likely also want optimal Ln packing.

2009-11-19 03:56:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options


* Nick Piggin <[email protected]> wrote:

> The only other use for L1 cache size macro is to pack objects to
> cachelines better (so they always use the fewest number of lines). But
> this case is more rare nowadays people don't really count cachelines
> anymore, but I think even then it makes sense for it to be the largest
> line size in the system because we don't know how big L1s are, and if
> you want opimal L1 packing, you likely also want optimal Ln packing.

We could do that - but then this default of X86_INTERNODE_CACHE_SHIFT:

+ default "7" if NUMA

will bite us and turns the 64 bytes L1_CACHE_BYTES into an effective 128
bytes value.

So ... are you arguing for an increase of the default x86 linesize to
128 bytes?

If yes then i'm not 100% against it, but we need more data and a careful
analysis of the bloat effect: a vmlinux comparison plus an estimation of
any dynamic bloat effects on this in a representative bootup with an
enterprise distro config. (SLAB uses the dynamic cacheline size value so
it's not affected - but there are some other runtime dependencies on
these kconfig values in the kernel.)

Furthermore, if we do it (double the default line size on x86), then we
in essence will standardize almost everything on
X86_INTERNODE_CACHE_SHIFT and CACHE_BYTES loses much of its meaning. If
we do then we need a vSMP measurement of the effects of this (and an Ack
from the vSMP folks) - it might work - or it might not.

If that all works out fine (which is a big if) then we can also
eliminate INTERNODE_CACHE_SHIFT and just have a single
CACHE_SHIFT/CACHE_BYTES arch tunable.

In any case i will apply Jan's current patch as it certainly is a step
forward that corrects a few inconsistencies and streamlines the status
quo. We can then do another patch to change the status quo.

Thanks,

Ingo

2009-11-19 04:43:45

by Jan Beulich

[permalink] [raw]
Subject: [tip:x86/mm] x86: Eliminate redundant/contradicting cache line size config options

Commit-ID: 350f8f5631922c7848ec4b530c111cb8c2ff7caa
Gitweb: http://git.kernel.org/tip/350f8f5631922c7848ec4b530c111cb8c2ff7caa
Author: Jan Beulich <[email protected]>
AuthorDate: Fri, 13 Nov 2009 11:54:40 +0000
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 19 Nov 2009 04:58:34 +0100

x86: Eliminate redundant/contradicting cache line size config options

Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT
(with inconsistent defaults), just having the latter suffices as
the former can be easily calculated from it.

To be consistent, also change X86_INTERNODE_CACHE_BYTES to
X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA
to account for last level cache line size (which here matters
more than L1 cache line size).

Finally, make sure the default value for X86_L1_CACHE_SHIFT,
when X86_GENERIC is selected, is being seen before that for the
individual CPU model options (other than on x86-64, where
GENERIC_CPU is part of the choice construct, X86_GENERIC is a
separate option on ix86).

Signed-off-by: Jan Beulich <[email protected]>
Acked-by: Ravikiran Thirumalai <[email protected]>
Acked-by: Nick Piggin <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/Kconfig.cpu | 14 +++++---------
arch/x86/boot/compressed/vmlinux.lds.S | 3 ++-
arch/x86/include/asm/cache.h | 7 ++++---
arch/x86/kernel/vmlinux.lds.S | 10 +++++-----
arch/x86/mm/tlb.c | 3 ++-
5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f2824fb..621f2bd 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU

#
# Define implied options from the CPU selection here
-config X86_L1_CACHE_BYTES
+config X86_INTERNODE_CACHE_SHIFT
int
- default "128" if MPSC
- default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
-
-config X86_INTERNODE_CACHE_BYTES
- int
- default "4096" if X86_VSMP
- default X86_L1_CACHE_BYTES if !X86_VSMP
+ default "12" if X86_VSMP
+ default "7" if NUMA
+ default X86_L1_CACHE_SHIFT

config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU

config X86_XADD
def_bool y
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb..a6f1a59 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)

#undef i386

+#include <asm/cache.h>
#include <asm/page_types.h>

#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
*(.data.*)
_edata = . ;
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d..2f9047c 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@

#define __read_mostly __attribute__((__section__(".data.read_mostly")))

+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
+
#ifdef CONFIG_X86_VSMP
-/* vSMP Internode cacheline shift */
-#define INTERNODE_CACHE_SHIFT (12)
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
- __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+ __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
__page_aligned_data
#endif
#endif
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index fd2dabe..eeb4f5f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -135,13 +135,13 @@ SECTIONS

PAGE_ALIGNED_DATA(PAGE_SIZE)

- CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)

DATA_DATA
CONSTRUCTORS

/* rarely changed data like cpu maps */
- READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)

/* End of data section */
_edata = .;
@@ -165,12 +165,12 @@ SECTIONS
*(.vsyscall_0)
} :user

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
*(.vsyscall_fn)
}

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
*(.vsyscall_gtod_data)
}
@@ -194,7 +194,7 @@ SECTIONS
}
vgetcpu_mode = VVIRT(.vgetcpu_mode);

- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) {
*(.jiffies)
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08e..65b58e4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

@@ -43,7 +44,7 @@ union smp_flush_state {
spinlock_t tlbstate_lock;
DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
- char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+ char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;

/* State is put into the per CPU data section, but padded

2009-11-19 04:51:00

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Thu, 19 Nov 2009 04:56:40 +0100
Ingo Molnar <[email protected]> wrote:

>
> * Nick Piggin <[email protected]> wrote:
>
> > The only other use for L1 cache size macro is to pack objects to
> > cachelines better (so they always use the fewest number of lines).
> > But this case is more rare nowadays people don't really count
> > cachelines anymore, but I think even then it makes sense for it to
> > be the largest line size in the system because we don't know how
> > big L1s are, and if you want opimal L1 packing, you likely also
> > want optimal Ln packing.
>
> We could do that - but then this default of X86_INTERNODE_CACHE_SHIFT:
>
> + default "7" if NUMA
>
> will bite us and turns the 64 bytes L1_CACHE_BYTES into an effective
> 128 bytes value.
>
> So ... are you arguing for an increase of the default x86 linesize to
> 128 bytes?

128 is basically always wrong.
(unless you have a P4... but for default really we should not care
about those anymore)


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-11-19 08:13:05

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Wed, Nov 18, 2009 at 08:52:40PM -0800, Arjan van de Ven wrote:
> On Thu, 19 Nov 2009 04:56:40 +0100
> Ingo Molnar <[email protected]> wrote:
>
> >
> > * Nick Piggin <[email protected]> wrote:
> >
> > > The only other use for L1 cache size macro is to pack objects to
> > > cachelines better (so they always use the fewest number of lines).
> > > But this case is more rare nowadays people don't really count
> > > cachelines anymore, but I think even then it makes sense for it to
> > > be the largest line size in the system because we don't know how
> > > big L1s are, and if you want opimal L1 packing, you likely also
> > > want optimal Ln packing.
> >
> > We could do that - but then this default of X86_INTERNODE_CACHE_SHIFT:
> >
> > + default "7" if NUMA
> >
> > will bite us and turns the 64 bytes L1_CACHE_BYTES into an effective
> > 128 bytes value.
> >
> > So ... are you arguing for an increase of the default x86 linesize to
> > 128 bytes?

Basically what I think we should do is consider L1_CACHE_BYTES to be
*the* correct default value to use for 1) avoiding false sharing (which
seems to be the most common use), and 2) optimal and repeatable per-object
packing into cachelines (which is more of a micro-optimization to be
applied carefully to really critical structures).

But in either case, I think basically with these semantics, it makes most
sense to use the largest line size in the system here (ie. "L1" isn't
really a good semantic because generic code still doesn't really know
enough to do much with that).

And then, I believe INTERNODE should basically just remain as a way
for vSMP to fine tune the size/speed tradeoffs on their platform, which
makes sense because it is so dissimilar to everyone else.

So with !VSMP, I think INTERNODE should just be the same as L1_CACHE_BYTES.

So this was my main point.


> 128 is basically always wrong.
> (unless you have a P4... but for default really we should not care
> about those anymore)

My other point was just this, but I don't care too much. But it is worded
pretty negatively. The key here is that increasing the value too large
tends to only cost a very small amount of size (and no increase in cacheline
foot print, only RAM). Wheras making the value too small could hurt
scalability by orders of magnitude in the worst case.

For a generic optimised kernel that is claiming to support P4, I think 128
has to be the correct value. If we want an option that supports a generic
for everything but P4 option, fine, but I have my doubts that 128 matters
that much.

Not such a relevant question for mainline, but are distros going to use
64 soon? I would certainly hate to have a customer "upgrade" a big P4
system to a much slower kernel that can't be fixed because of ABI issues,
even if there are only one or two of them left out there.

I don't know, I don't worry about this second point so much though :)

2009-11-19 08:38:12

by Jan Beulich

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

>>> Nick Piggin <[email protected]> 19.11.09 09:13 >>>
>On Wed, Nov 18, 2009 at 08:52:40PM -0800, Arjan van de Ven wrote:
>Basically what I think we should do is consider L1_CACHE_BYTES to be
>*the* correct default value to use for 1) avoiding false sharing (which
>seems to be the most common use), and 2) optimal and repeatable per-object
>packing into cachelines (which is more of a micro-optimization to be
>applied carefully to really critical structures).

But then this really shouldn't be called L1_CACHE_... Though I realize
that the naming seems to already be broken - looking over the cache
line specifiers for CPUID leaf 2, there's really no L1 with 128 byte lines,
just two L2s.

One question however is whether e.g. cache line ping-pong between
L3s is really costing that much on non-NUMA, as opposed to it
happening between L1s.

Jan

2009-11-19 10:00:53

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Thu, Nov 19, 2009 at 08:38:14AM +0000, Jan Beulich wrote:
> >>> Nick Piggin <[email protected]> 19.11.09 09:13 >>>
> >On Wed, Nov 18, 2009 at 08:52:40PM -0800, Arjan van de Ven wrote:
> >Basically what I think we should do is consider L1_CACHE_BYTES to be
> >*the* correct default value to use for 1) avoiding false sharing (which
> >seems to be the most common use), and 2) optimal and repeatable per-object
> >packing into cachelines (which is more of a micro-optimization to be
> >applied carefully to really critical structures).
>
> But then this really shouldn't be called L1_CACHE_... Though I realize
> that the naming seems to already be broken - looking over the cache
> line specifiers for CPUID leaf 2, there's really no L1 with 128 byte lines,
> just two L2s.

Yes, I agree L1_CACHE is not the best name. In what situation would
you *only* care about L1 cache line size, without knowing any other
line sizes? IMO only in the case where you also know more details
about L1 cache like size and write some particular cache blocking or
algorithm like that. And we don't really do that in kernel, especially
not in generic code.


> One question however is whether e.g. cache line ping-pong between
> L3s is really costing that much on non-NUMA, as opposed to it
> happening between L1s.

Well I think we still need to work to minimise intra-chip bouncing
even though it is far cheaper than inter-chip. It is still costly
and probably costs more power too. And as core count continues to
increase, I think even intra-chip bouncing costs are going to become
important (8 core Nehalem I think already doesn't have a true
unified L3 cache with crossbars to each core but has 8 L3 caches
connected with ring busses).

I don't think it makes too much sense to add much complexity to
say "oh we don't care about bouncing between threads on core or
cores on chip" because I haven't seen anywhere we can get a
significant data size benefit, and it often slows down the straight
line performance too (eg. per-cpu variable can often be non atomic,
but when you even share it between threads on a core then you have
to start using atomics).

2009-11-19 15:58:22

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Thu, 19 Nov 2009 09:13:07 +0100
Nick Piggin <[email protected]> wrote:
>
> My other point was just this, but I don't care too much. But it is
> worded pretty negatively. The key here is that increasing the value
> too large tends to only cost a very small amount of size (and no
> increase in cacheline foot print, only RAM).

128 has a pretty significant impact on TPC-C benchmarks.....
it was the top issue until mainline fixed it to default to 64

2009-11-19 16:18:05

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Thu, Nov 19, 2009 at 07:59:58AM -0800, Arjan van de Ven wrote:
> On Thu, 19 Nov 2009 09:13:07 +0100
> Nick Piggin <[email protected]> wrote:
> >
> > My other point was just this, but I don't care too much. But it is
> > worded pretty negatively. The key here is that increasing the value
> > too large tends to only cost a very small amount of size (and no
> > increase in cacheline foot print, only RAM).
>
> 128 has a pretty significant impact on TPC-C benchmarks.....
> it was the top issue until mainline fixed it to default to 64

Really? I'm surprised, how much was it?

AFAIKS, in any case that 128 byte alignment is used, cache footprint
should not increased on a 64B line system, over 64 byte alignment.

I do see a silly thing in slab code that does not build a 192 byte
kmalloc slab in the case L1 cache bytes is 128 (it should build the
slab for the possibility that we've booted on a 64 byte system).
That might be increasing memory footprint a bit. But I still don't
see that cache footprint should be increased.

TLB, perhaps, because of increased memory usage. But I would have
thought memory usage increase should be pretty miniscule.

2009-11-19 17:53:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

Nick Piggin <[email protected]> writes:
>
> AFAIKS, in any case that 128 byte alignment is used, cache footprint
> should not increased on a 64B line system, over 64 byte alignment.

Yes.

It's probably some silly bug somewhere. I don't think we should
completely abandon P4 systems (of which there are still plenty around)
just for silly bugs, that can be probably properly fixed.

-Andi

--
[email protected] -- Speaking for myself only.

2009-11-23 08:35:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options


* Arjan van de Ven <[email protected]> wrote:

> On Thu, 19 Nov 2009 09:13:07 +0100
> Nick Piggin <[email protected]> wrote:
> >
> > My other point was just this, but I don't care too much. But it is
> > worded pretty negatively. The key here is that increasing the value
> > too large tends to only cost a very small amount of size (and no
> > increase in cacheline foot print, only RAM).
>
> 128 has a pretty significant impact on TPC-C benchmarks.....
> it was the top issue until mainline fixed it to default to 64

Mind sending a patch that sets the default to 64 on NUMA too?

P4 based NUMA boxes are ... a bad memory to be forgotten.

Ingo

2009-11-23 09:35:31

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Mon, Nov 23, 2009 at 09:34:59AM +0100, Ingo Molnar wrote:
>
> * Arjan van de Ven <[email protected]> wrote:
>
> > On Thu, 19 Nov 2009 09:13:07 +0100
> > Nick Piggin <[email protected]> wrote:
> > >
> > > My other point was just this, but I don't care too much. But it is
> > > worded pretty negatively. The key here is that increasing the value
> > > too large tends to only cost a very small amount of size (and no
> > > increase in cacheline foot print, only RAM).
> >
> > 128 has a pretty significant impact on TPC-C benchmarks.....
> > it was the top issue until mainline fixed it to default to 64
>
> Mind sending a patch that sets the default to 64 on NUMA too?

This is what I mean. It should all be the same value, and that
value should depend on the architectures to support (rather than
NUMA or something like that). With the internode simply being
the exception for the exceptional vSMP architecture.


> P4 based NUMA boxes are ... a bad memory to be forgotten.

I still think it would make sense to do this via Kconfig rather
than implicitly saying that we don't care about P4s even if
the user has apparently wanted to support them.

2009-11-23 10:09:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options


* Nick Piggin <[email protected]> wrote:

> On Mon, Nov 23, 2009 at 09:34:59AM +0100, Ingo Molnar wrote:
> >
> > * Arjan van de Ven <[email protected]> wrote:
> >
> > > On Thu, 19 Nov 2009 09:13:07 +0100
> > > Nick Piggin <[email protected]> wrote:
> > > >
> > > > My other point was just this, but I don't care too much. But it is
> > > > worded pretty negatively. The key here is that increasing the value
> > > > too large tends to only cost a very small amount of size (and no
> > > > increase in cacheline foot print, only RAM).
> > >
> > > 128 has a pretty significant impact on TPC-C benchmarks.....
> > > it was the top issue until mainline fixed it to default to 64
> >
> > Mind sending a patch that sets the default to 64 on NUMA too?
>
> This is what I mean. It should all be the same value, and that
> value should depend on the architectures to support (rather than
> NUMA or something like that). With the internode simply being
> the exception for the exceptional vSMP architecture.
>
>
> > P4 based NUMA boxes are ... a bad memory to be forgotten.
>
> I still think it would make sense to do this via Kconfig rather than
> implicitly saying that we don't care about P4s even if the user has
> apparently wanted to support them.

That was what i meant. Right now if P4 is set in the .config we'll use a
cache-shift of 7 - i.e. 128 byte cacheline size.

What we want is to remove that NUMA dependent quirk - it doesnt make
sense.

Ingo

2009-11-23 14:50:16

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Mon, 23 Nov 2009 09:34:59 +0100
Ingo Molnar <[email protected]> wrote:

>
> * Arjan van de Ven <[email protected]> wrote:
>
> > On Thu, 19 Nov 2009 09:13:07 +0100
> > Nick Piggin <[email protected]> wrote:
> > >
> > > My other point was just this, but I don't care too much. But it is
> > > worded pretty negatively. The key here is that increasing the
> > > value too large tends to only cost a very small amount of size
> > > (and no increase in cacheline foot print, only RAM).
> >
> > 128 has a pretty significant impact on TPC-C benchmarks.....
> > it was the top issue until mainline fixed it to default to 64
>
> Mind sending a patch that sets the default to 64 on NUMA too?
>
> P4 based NUMA boxes are ... a bad memory to be forgotten.

this patch adds a regression. Linux defaulted to 64 since... March or so.

now we go back to the old setting; Nick should fix that. Or at least
extremely document and justify this change....



--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-11-23 15:15:28

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH] x86: eliminate redundant/contradicting cache line size config options

On Mon, Nov 23, 2009 at 06:52:01AM -0800, Arjan van de Ven wrote:
> On Mon, 23 Nov 2009 09:34:59 +0100
> Ingo Molnar <[email protected]> wrote:
>
> >
> > * Arjan van de Ven <[email protected]> wrote:
> >
> > > On Thu, 19 Nov 2009 09:13:07 +0100
> > > Nick Piggin <[email protected]> wrote:
> > > >
> > > > My other point was just this, but I don't care too much. But it is
> > > > worded pretty negatively. The key here is that increasing the
> > > > value too large tends to only cost a very small amount of size
> > > > (and no increase in cacheline foot print, only RAM).
> > >
> > > 128 has a pretty significant impact on TPC-C benchmarks.....
> > > it was the top issue until mainline fixed it to default to 64
> >
> > Mind sending a patch that sets the default to 64 on NUMA too?
> >
> > P4 based NUMA boxes are ... a bad memory to be forgotten.
>
> this patch adds a regression. Linux defaulted to 64 since.. march or so.
>
> now we go back to the old setting; Nick should fix that. Or at least
> extremely document and justify this change....

Oh well I didn't think the change to 128 should be done without
discussion, and I just raised my opinion. You did make some good
counter points so perhaps mainline is best to continue with 64.
However it would be nice to get to the bottom of why it cost
so much on your OLTP.