SLUB should pack even small objects nicely into cachelines if that is what
has been asked for. Use the same algorithm as SLAB for this.
Signed-off-by: Nick Piggin <[email protected]>
---
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -1896,12 +1896,15 @@ static unsigned long calculate_alignment
* specified alignment though. If that is greater
* then use it.
*/
- if ((flags & SLAB_HWCACHE_ALIGN) &&
- size > cache_line_size() / 2)
- return max_t(unsigned long, align, cache_line_size());
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
if (align < ARCH_SLAB_MINALIGN)
- return ARCH_SLAB_MINALIGN;
+ align = ARCH_SLAB_MINALIGN;
return ALIGN(align, sizeof(void *));
}
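For illustration, a small userspace model of the resulting calculation (a 64
byte cache line and an 8 byte word size are assumed) and the alignments it
yields:

/*
 * Standalone userspace model of the patched calculate_alignment();
 * a 64 byte cache line and an 8 byte word size are assumed.
 */
#include <stdio.h>

#define CACHE_LINE	64UL	/* stands in for cache_line_size() */
#define MINALIGN	8UL	/* stands in for ARCH_SLAB_MINALIGN */

static unsigned long calc_align(unsigned long align, unsigned long size)
{
	unsigned long ralign = CACHE_LINE;

	/* Halve the target until at least two objects fit per line. */
	while (size <= ralign / 2)
		ralign /= 2;
	if (ralign > align)
		align = ralign;

	if (align < MINALIGN)
		align = MINALIGN;
	/* ALIGN(align, sizeof(void *)) in the kernel. */
	return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
}

int main(void)
{
	unsigned long sizes[] = { 8, 16, 24, 32, 40, 64, 100 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %3lu -> align %2lu\n",
		       sizes[i], calc_align(0, sizes[i]));
	return 0;
}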
Introduce SLAB_SMP_ALIGN, for allocations where false sharing must be
minimised on SMP systems.
Signed-off-by: Nick Piggin <[email protected]>
---
Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h
+++ linux-2.6/include/linux/slab.h
@@ -22,7 +22,8 @@
#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
-#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
+#define SLAB_SMP_ALIGN 0x00004000UL /* Align on cachelines for SMP */
+#define SLAB_CACHE_DMA 0x00008000UL /* Use GFP_DMA memory */
#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c
+++ linux-2.6/mm/slab.c
@@ -174,13 +174,13 @@
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK (SLAB_RED_ZONE | \
- SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+ SLAB_POISON | SLAB_HWCACHE_ALIGN | SLAB_SMP_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#else
-# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
+# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_SMP_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
@@ -2245,6 +2245,9 @@ kmem_cache_create (const char *name, siz
ralign = BYTES_PER_WORD;
}
+ if ((flags & SLAB_SMP_ALIGN) && num_possible_cpus() > 1)
+ ralign = max_t(unsigned long, ralign, cache_line_size());
+
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -1903,6 +1903,9 @@ static unsigned long calculate_alignment
align = max(align, ralign);
}
+ if ((flags & SLAB_SMP_ALIGN) && num_possible_cpus() > 1)
+ align = max_t(unsigned long, align, cache_line_size());
+
if (align < ARCH_SLAB_MINALIGN)
align = ARCH_SLAB_MINALIGN;
Use SLAB_SMP_ALIGN in a few places.
Signed-off-by: Nick Piggin <[email protected]>
---
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -332,8 +332,8 @@ void __init bdev_cache_init(void)
{
int err;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
- 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ 0, (SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1547,23 +1547,23 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), 0,
SLAB_PANIC, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
}
/*
Index: linux-2.6/kernel/pid.c
===================================================================
--- linux-2.6.orig/kernel/pid.c
+++ linux-2.6/kernel/pid.c
@@ -526,5 +526,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_SMP_ALIGN | SLAB_PANIC);
}
Index: linux-2.6/kernel/user.c
===================================================================
--- linux-2.6.orig/kernel/user.c
+++ linux-2.6/kernel/user.c
@@ -503,7 +503,7 @@ static int __init uid_cache_init(void)
int n;
uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
Hi Nick,
On Mon, Mar 3, 2008 at 11:34 AM, Nick Piggin <[email protected]> wrote:
> SLUB should pack even small objects nicely into cachelines if that is what
> has been asked for. Use the same algorithm as SLAB for this.
The patches look good but this changelog is missing context. Why do we
want to do this?
On Mon, Mar 3, 2008 at 11:34 AM, Nick Piggin <[email protected]> wrote:
> @@ -1896,12 +1896,15 @@ static unsigned long calculate_alignment
> * specified alignment though. If that is greater
> * then use it.
> */
You might want to fix the above comment too.
> - if ((flags & SLAB_HWCACHE_ALIGN) &&
> - size > cache_line_size() / 2)
> - return max_t(unsigned long, align, cache_line_size());
> + if (flags & SLAB_HWCACHE_ALIGN) {
> + unsigned long ralign = cache_line_size();
> + while (size <= ralign / 2)
> + ralign /= 2;
> + align = max(align, ralign);
> + }
Nick Piggin wrote:
> Use SLAB_SMP_ALIGN in a few places.
>
>
I don't understand why you added SLAB_SMP_ALIGN without removing
SLAB_HWCACHE_ALIGN in these places.
> Signed-off-by: Nick Piggin <[email protected]>
> ---
> Index: linux-2.6/fs/block_dev.c
> ===================================================================
> --- linux-2.6.orig/fs/block_dev.c
> +++ linux-2.6/fs/block_dev.c
> @@ -332,8 +332,8 @@ void __init bdev_cache_init(void)
> {
> int err;
> bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
> - 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
> - SLAB_MEM_SPREAD|SLAB_PANIC),
> + 0, (SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|
> + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_PANIC),
> init_once);
> err = register_filesystem(&bd_type);
> if (err)
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c
> +++ linux-2.6/kernel/fork.c
> @@ -1547,23 +1547,23 @@ void __init proc_caches_init(void)
> {
> sighand_cachep = kmem_cache_create("sighand_cache",
> sizeof(struct sighand_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
> + SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
> sighand_ctor);
> signal_cachep = kmem_cache_create("signal_cache",
> sizeof(struct signal_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> + SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> files_cachep = kmem_cache_create("files_cache",
> sizeof(struct files_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> + SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> fs_cachep = kmem_cache_create("fs_cache",
> sizeof(struct fs_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> + SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> vm_area_cachep = kmem_cache_create("vm_area_struct",
> sizeof(struct vm_area_struct), 0,
> SLAB_PANIC, NULL);
> mm_cachep = kmem_cache_create("mm_struct",
> sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> + SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> }
>
> /*
> Index: linux-2.6/kernel/pid.c
> ===================================================================
> --- linux-2.6.orig/kernel/pid.c
> +++ linux-2.6/kernel/pid.c
> @@ -526,5 +526,5 @@ void __init pidmap_init(void)
> atomic_dec(&init_pid_ns.pidmap[0].nr_free);
>
> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> - SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> + SLAB_HWCACHE_ALIGN | SLAB_SMP_ALIGN | SLAB_PANIC);
> }
> Index: linux-2.6/kernel/user.c
> ===================================================================
> --- linux-2.6.orig/kernel/user.c
> +++ linux-2.6/kernel/user.c
> @@ -503,7 +503,7 @@ static int __init uid_cache_init(void)
> int n;
>
> uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
> - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> + 0, SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
>
> for(n = 0; n < UIDHASH_SZ; ++n)
> INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
On Mon, Mar 03, 2008 at 11:44:07AM +0200, Pekka Enberg wrote:
> Hi Nick,
>
> On Mon, Mar 3, 2008 at 11:34 AM, Nick Piggin <[email protected]> wrote:
> > SLUB should pack even small objects nicely into cachelines if that is what
> > has been asked for. Use the same algorithm as SLAB for this.
>
> The patches look good but this changelog is missing context. Why do we
> want to do this?
You're right, the changelog should be improved if/when it is merged.
Actually the context is discussed in another thread just now on lkml
if you are interested ("alloc_percpu() fails to allocate percpu data").
>
> On Mon, Mar 3, 2008 at 11:34 AM, Nick Piggin <[email protected]> wrote:
> > @@ -1896,12 +1896,15 @@ static unsigned long calculate_alignment
> > * specified alignment though. If that is greater
> > * then use it.
> > */
>
> You might want to fix the above comment too.
Yes.
>
> > - if ((flags & SLAB_HWCACHE_ALIGN) &&
> > - size > cache_line_size() / 2)
> > - return max_t(unsigned long, align, cache_line_size());
> > + if (flags & SLAB_HWCACHE_ALIGN) {
> > + unsigned long ralign = cache_line_size();
> > + while (size <= ralign / 2)
> > + ralign /= 2;
> > + align = max(align, ralign);
> > + }
On Mon, Mar 03, 2008 at 10:53:52AM +0100, Eric Dumazet wrote:
> Nick Piggin wrote:
> >Use SLAB_SMP_ALIGN in a few places.
> >
> >
>
> I dont understand why you added SLAB_SMP_ALIGN, without removing
> SLAB_HWCACHE_ALIGN on these places.
Because I thought that in most of these cases we also want some cacheline
alignment on UP systems, because we care about the layout of the
structure WRT the cachelines for the mandatory/capacity miss cases, as
well as wanting to avoid false sharing misses on SMP.
Actually I didn't think _too_ hard about them, possibly some could be
removed. But the problem is that these things do require careful
thought so I should not change them unless I have done that ;)
I guess there are some basic guidelines -- if size is a problem (ie if
there can be lots of these structures), then that is going to be a
factor; if the total pool of objects is likely to be fairly densely
resident in cache, then it will start to favour dense packing rather
than good alignment.
>
> >Signed-off-by: Nick Piggin <[email protected]>
> >---
> >Index: linux-2.6/fs/block_dev.c
> >===================================================================
> >--- linux-2.6.orig/fs/block_dev.c
> >+++ linux-2.6/fs/block_dev.c
> >@@ -332,8 +332,8 @@ void __init bdev_cache_init(void)
> > {
> > int err;
> > bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct
> > bdev_inode),
> >- 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
> >- SLAB_MEM_SPREAD|SLAB_PANIC),
> >+ 0, (SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|
> >+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_PANIC),
> > init_once);
> > err = register_filesystem(&bd_type);
> > if (err)
> >Index: linux-2.6/kernel/fork.c
> >===================================================================
> >--- linux-2.6.orig/kernel/fork.c
> >+++ linux-2.6/kernel/fork.c
> >@@ -1547,23 +1547,23 @@ void __init proc_caches_init(void)
> > {
> > sighand_cachep = kmem_cache_create("sighand_cache",
> > sizeof(struct sighand_struct), 0,
> >- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
> >+
> >SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
> > sighand_ctor);
> > signal_cachep = kmem_cache_create("signal_cache",
> > sizeof(struct signal_struct), 0,
> >- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> >+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> > files_cachep = kmem_cache_create("files_cache",
> > sizeof(struct files_struct), 0,
> >- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> >+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> > fs_cachep = kmem_cache_create("fs_cache",
> > sizeof(struct fs_struct), 0,
> >- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> >+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> > vm_area_cachep = kmem_cache_create("vm_area_struct",
> > sizeof(struct vm_area_struct), 0,
> > SLAB_PANIC, NULL);
> > mm_cachep = kmem_cache_create("mm_struct",
> > sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
> >- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> >+ SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC, NULL);
> > }
> >
> > /*
> >Index: linux-2.6/kernel/pid.c
> >===================================================================
> >--- linux-2.6.orig/kernel/pid.c
> >+++ linux-2.6/kernel/pid.c
> >@@ -526,5 +526,5 @@ void __init pidmap_init(void)
> > atomic_dec(&init_pid_ns.pidmap[0].nr_free);
> >
> > init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> >- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> >+ SLAB_HWCACHE_ALIGN | SLAB_SMP_ALIGN | SLAB_PANIC);
> > }
> >Index: linux-2.6/kernel/user.c
> >===================================================================
> >--- linux-2.6.orig/kernel/user.c
> >+++ linux-2.6/kernel/user.c
> >@@ -503,7 +503,7 @@ static int __init uid_cache_init(void)
> > int n;
> >
> > uid_cachep = kmem_cache_create("uid_cache", sizeof(struct
> > user_struct),
> >- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
> >+ 0, SLAB_HWCACHE_ALIGN|SLAB_SMP_ALIGN|SLAB_PANIC,
> >NULL);
> >
> > for(n = 0; n < UIDHASH_SZ; ++n)
> > INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
Nick Piggin wrote:
> On Mon, Mar 03, 2008 at 10:53:52AM +0100, Eric Dumazet wrote:
>
>> Nick Piggin wrote:
>>
>>> Use SLAB_SMP_ALIGN in a few places.
>>>
>>>
>>>
>> I dont understand why you added SLAB_SMP_ALIGN, without removing
>> SLAB_HWCACHE_ALIGN on these places.
>>
>
> Because I thought that in most of the cases, we also want some cacheline
> alignment on UP systems as well because we care about the layout of the
> structure WRT the cachelines for the mandatory/capacity miss cases, as
> well as wanting to avoid false sharing misses on SMP.
>
> Actually I didn't think _too_ hard about them, possibly some could be
> removed. But the problem is that these things do require careful
> thought so I should not change them unless I have done that ;)
>
> I guess there are some basic guidelines -- if size is a problem (ie if
> there can be lots of these structures), then that is going to be a
> factor; if the total pool of objects is likely to be fairly densely
> resident in cache, then it will start to favour dense packing rather
> than good alignment.
>
>
Well, if kmem_cache_create() is used, it is probably because the number
of objects can be large, so kmalloc()'s power-of-two granularity could
waste a lot of ram.
But yes, you are right that SLAB_SMP_ALIGN doesn't imply SLAB_HWCACHE_ALIGN:
- SMP_ALIGN is a hint about false sharing (when an object contains a refcnt,
for example), which is a concern only if
num_possible_cpus() > 1.
While HWCACHE_ALIGN might be a hint saying:
- The writer carefully designed the structure so that maximum performance is
obtained when all objects start on a cache line boundary, even on a
uniprocessor.
But I suspect some uses of HWCACHE_ALIGN are not a hint but a strong
requirement.
Maybe we need to use three flags to separate the meanings?
SLAB_HINT_SMP_ALIGN
SLAB_HINT_HWCACHE_ALIGN
SLAB_HWCACHE_ALIGN /* strong requirement that two objects don't share a
cache line */
On Mon, Mar 03, 2008 at 02:00:51PM +0100, Eric Dumazet wrote:
> Nick Piggin wrote:
> >On Mon, Mar 03, 2008 at 10:53:52AM +0100, Eric Dumazet wrote:
> >
> >>Nick Piggin wrote:
> >>
> >>>Use SLAB_SMP_ALIGN in a few places.
> >>>
> >>>
> >>>
> >>I dont understand why you added SLAB_SMP_ALIGN, without removing
> >>SLAB_HWCACHE_ALIGN on these places.
> >>
> >
> >Because I thought that in most of the cases, we also want some cacheline
> >alignment on UP systems as well because we care about the layout of the
> >structure WRT the cachelines for the mandatory/capacity miss cases, as
> >well as wanting to avoid false sharing misses on SMP.
> >
> >Actually I didn't think _too_ hard about them, possibly some could be
> >removed. But the problem is that these things do require careful
> >thought so I should not change them unless I have done that ;)
> >
> >I guess there are some basic guidelines -- if size is a problem (ie if
> >there can be lots of these structures), then that is going to be a
> >factor; if the total pool of objects is likely to be fairly densely
> >resident in cache, then it will start to favour dense packing rather
> >than good alignment.
> >
> >
> Well, if a kmem_cache_create() is used, this is probably because number
> of objects can be large, so kmalloc() power-of-two granularity could
> waste lot of ram.
Or because you want explicit control over alignment ;)
> But yes, you are right that SLAB_SMP_ALIGN doesnt imply SLAB_HWCACHE_ALIGN
>
> - SMP_ALIGN is a hint about false sharing (when object contains a refcnt
> for example), that is a concern only if
>
> num_possible_cpus() > 1
>
> While HWCACHE_ALIGN might be a hint saying :
> - The writer carefully designed the structure so that max performance is
> obtained when all objects starts on a cache line boundary, even on
> Uniprocessor.
Yes. In which case we are also happy for small objects to share cachelines
(so long as they are still nicely aligned, which SLUB currently does not
do)... unless SMP_ALIGN is set, of course.
> But I suspect some uses of HWCACHE_ALIGN are not a hint but a strong
> requirement.
>
> Maybe we need to use three flags to separate the meanings ?
>
>
> SLAB_HINT_SMP_ALIGN
> SLAB_HINT_HWCACHE_ALIGN
> SLAB_HWCACHE_ALIGN /* strong requirement that two objects dont share a
> cache line */
Possibly, but I'm beginning to prefer that strong requirements should
request the explicit alignment (they can even use cache_line_size() after
Pekka's patch to make it generic). I don't like how the name implies
that you get a guarantee, however I guess in practice people are using it
more as a hint (or because they vaguely hope it makes their code run
faster :))
So I wouldn't be averse to a rename...
Hi,
On Mon, Mar 3, 2008 at 3:46 PM, Nick Piggin <[email protected]> wrote:
> > Maybe we need to use three flags to separate the meanings ?
> >
> > SLAB_HINT_SMP_ALIGN
> > SLAB_HINT_HWCACHE_ALIGN
> > SLAB_HWCACHE_ALIGN /* strong requirement that two objects dont share a
> > cache line */
>
> Possibly, but I'm beginning to prefer that strong requirements should
> request the explicit alignment (they can even use cache_line_size() after
> Pekka's patch to make it generic). I don't like how the name implies
> that you get a guarantee, however I guess in practice people are using it
> more as a hint (or because they vaguely hope it makes their code run
> faster :))
At least historically SLAB_HWCACHE_ALIGN has been just a hint,
although slab tries very hard to satisfy it (see the comments in
mm/slab.c). Why do we need stronger guarantees than that, btw?
Pekka Enberg wrote:
> Hi,
>
> On Mon, Mar 3, 2008 at 3:46 PM, Nick Piggin <[email protected]> wrote:
>
>> > Maybe we need to use three flags to separate the meanings ?
>> >
>> > SLAB_HINT_SMP_ALIGN
>> > SLAB_HINT_HWCACHE_ALIGN
>> > SLAB_HWCACHE_ALIGN /* strong requirement that two objects dont share a
>> > cache line */
>>
>> Possibly, but I'm beginning to prefer that strong requirements should
>> request the explicit alignment (they can even use cache_line_size() after
>> Pekka's patch to make it generic). I don't like how the name implies
>> that you get a guarantee, however I guess in practice people are using it
>> more as a hint (or because they vaguely hope it makes their code run
>> faster :))
>>
>
> At least historically SLAB_HWCACHE_ALIGN has been just a hint,
> although slab tries very hard to satisfy it (see the comments in
> mm/slab.c). Why do we need stronger guarantees than that, btw?
>
>
This reminds me of a previous attempt to remove SLAB_HWCACHE_ALIGN:
http://kernel.org/pub/linux/kernel/people/christoph/patch-archive/2007/2.6.21-rc6/remove_hwcache_align
At that time Christoph didn't take into account the CONFIG_SMP aspect
(false sharing avoidance), nor that L1_CACHE_SIZE is a compile-time
constant that can differ from cache_line_size().
On Mon, 3 Mar 2008, Nick Piggin wrote:
> Introduce SLAB_SMP_ALIGN, for allocations where false sharing must be
> minimised on SMP systems.
Mandatory alignments are specified as a parameter to kmem_cache_create(), not
as flags. You can specify the alignment as a parameter, i.e. L1_CACHE_BYTES
or cache_line_size().
Maybe we should drop SLAB_HWCACHE_ALIGN because of the confusion it creates,
since it seems to suggest that flags can be used to enforce alignments?
On Mon, 3 Mar 2008, Nick Piggin wrote:
> SLUB should pack even small objects nicely into cachelines if that is what
> has been asked for. Use the same algorithm as SLAB for this.
Why? SLAB does not cacheline-align objects smaller than cache_line_size()/2.
If they are not cache line aligned anyway, then we can make them as
dense as possible. That is what SLUB does.
On Mon, 3 Mar 2008, Pekka Enberg wrote:
> At least historically SLAB_HWCACHE_ALIGN has been just a hint,
> although slab tries very hard to satisfy it (see the comments in
> mm/slab.c). Why do we need stronger guarantees than that, btw?
It's a hint. Alignment is specified as a parameter to kmem_cache_create(), not
as a flag. Maybe we could remove SLAB_HWCACHE_ALIGN because it is causing
so much confusion?
On Mon, 3 Mar 2008, Eric Dumazet wrote:
> This reminds me a previous attempt of removing SLAB_HWCACHE_ALIGN
>
> http://kernel.org/pub/linux/kernel/people/christoph/patch-archive/2007/2.6.21-rc6/remove_hwcache_align
>
> At that time Christoph didnt took into account the CONFIG_SMP thing (false
> sharing avoidance), but also that L1_CACHE_SIZE is a compile constant, that
> can differs with cache_line_size()
If cache_line_size() is universally available then we can specify it as
the alignment parameter to kmem_cache_create(). That would allow removal
of SLAB_HWCACHE_ALIGN.
On Mon, Mar 03, 2008 at 11:06:55AM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > Introduce SLAB_SMP_ALIGN, for allocations where false sharing must be
> > minimised on SMP systems.
>
> Mandatory alignment are specified as a parameter to kmem_cache_create not
> as flags.
Not after this patch.
> Ycan specify the alignment as a parameter. i.e. L1_CACHE_BYTES
> or cache_line_size().
We don't want either of those, though.
I don't see why this is a problem to do inside slab.
On Mon, Mar 03, 2008 at 11:08:44AM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > SLUB should pack even small objects nicely into cachelines if that is what
> > has been asked for. Use the same algorithm as SLAB for this.
>
> Why? SLAB does not cacheline align objects smaller than cache_line_size()
> /2.
It still doesn't after this patch.
> If they are already not cache line aligned then we can make them as
> dense as possible. That is what SLUB does.
Because when you specify HWCACHE_ALIGN, it means that you want the object
not to cross cacheline boundaries for at least cache_line_size() bytes.
SLAB does this. SLUB does not without this patch.
On Mon, 3 Mar 2008, Nick Piggin wrote:
> > Mandatory alignment are specified as a parameter to kmem_cache_create not
> > as flags.
>
> Not after this patch.
You want two ways of specifying alignments?
> > Ycan specify the alignment as a parameter. i.e. L1_CACHE_BYTES
> > or cache_line_size().
>
> We don't want either of those, though.
>
> I don't see why this is a problem to do inside slab.
I am not sure why you need to have two ways of specifying alignments.
On Mon, Mar 03, 2008 at 11:09:51AM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Pekka Enberg wrote:
>
> > At least historically SLAB_HWCACHE_ALIGN has been just a hint,
> > although slab tries very hard to satisfy it (see the comments in
> > mm/slab.c). Why do we need stronger guarantees than that, btw?
>
> Its a hint. Alignment is specified as a parameter to kmem_cache_create not
> as a flag. Maybe we could remove SLAB_HWCACHE_ALIGN because it is causing
> so much confusion?
Yeah, that's what I thought too, when I got confused by these new
SLUB semantics that you made up. Actually if you look at SLAB,
it has very precise and rational semantics. SLUB should respect that.
If you really configure for tiny memory footprint, then I'm fine with
it going away. In that respect, it is still a hint (the callers can't
rely on it being a particular alignment), and that also applies for
SLAB_SMP_ALIGN, in case you are concerned that flags must only be hints.
On Mon, 3 Mar 2008, Nick Piggin wrote:
> > If they are already not cache line aligned then we can make them as
> > dense as possible. That is what SLUB does.
>
> Because when you specify HWCACHE_ALIGN, it means that you want the object
> not to cross cacheline boundaries for at least cache_line_size() bytes.
> SLAB does this. SLUB does not without this patch.
HWCACHE_ALIGN means that you want the object to be aligned at
cacheline boundaries for optimization. Why does crossing cacheline
boundaries matter in this case?
On Mon, 3 Mar 2008, Nick Piggin wrote:
> Yeah, that's what I thought too, when I got confused by these new
> SLUB semantics that you made up. Actually if you look at SLAB,
> it has very precise and rational semantics. SLUB should respect that.
These crappy semantics are SLAB semantics that SLUB must support.
On Mon, Mar 03, 2008 at 12:09:43PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > > Mandatory alignment are specified as a parameter to kmem_cache_create not
> > > as flags.
> >
> > Not after this patch.
>
> You want two ways of specifying alignments?
I want a way to ask for SMP friendly allocation.
On Mon, Mar 03, 2008 at 12:10:59PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > > If they are already not cache line aligned then we can make them as
> > > dense as possible. That is what SLUB does.
> >
> > Because when you specify HWCACHE_ALIGN, it means that you want the object
> > not to cross cacheline boundaries for at least cache_line_size() bytes.
> > SLAB does this. SLUB does not without this patch.
>
> HWCACHE_ALIGN means that you want the object to be aligned at
> cacheline boundaries for optimization. Why does crossing cacheline
> boundaries matter in this case?
No, HWCACHE_ALIGN means that you want the object not to cross cacheline
boundaries for at least cache_line_size() bytes. You invented new
semantics with SLUB, but all the callers were written expecting the
existing semantics, presumably. So we should retain that behaviour, and
if you disagree with the actual callers, then that is fine but you
should discuss each case with the relevant people.
On Mon, 3 Mar 2008, Nick Piggin wrote:
> > You want two ways of specifying alignments?
>
> I want a way to ask for SMP friendly allocation.
Then align the objects at cacheline boundaries by providing a value for
the align parameter to kmem_cache_create().
On Mon, Mar 03, 2008 at 12:12:15PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > Yeah, that's what I thought too, when I got confused by these new
> > SLUB semantics that you made up. Actually if you look at SLAB,
> > it has very precise and rational semantics. SLUB should respect that.
>
> These crappy semantics are SLAB semantics that SLUB must support.
They are only crappy in SLUB. In SLAB they are actually useful.
On Mon, Mar 03, 2008 at 12:17:20PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > > You want two ways of specifying alignments?
> >
> > I want a way to ask for SMP friendly allocation.
>
> Then align the objects at cacheline boundaries by providing a value for
> the align parameter to kmem_cache_create().
max(num_possible_cpus() > 1 ? cache_line_size() : 0, mandatory_alignment)?
Then suppose we want a CONFIG_TINY option to eliminate it?
max(!CONFIG_TINY && num_possible_cpus() > 1 ? cache_line_size() : 0, mandatory_alignment)
And maybe the VSMP guys will want to blow this out to their internode
alignment?
max(!CONFIG_TINY && num_possible_cpus() > 1 ? (is_vsmp ? internode_alignment : cache_line_size()) : 0, mandatory_alignment)
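Written out as the helper every caller would otherwise have to carry around
(CONFIG_TINY and the VSMP case are hypothetical here, the rest exists today),
it looks something like:

/*
 * Sketch of the open-coded alignment every caller would need without a flag.
 * CONFIG_TINY is the hypothetical option above.
 */
#include <linux/cache.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>

static inline unsigned long smp_friendly_align(unsigned long mandatory)
{
	unsigned long a = 0;

#ifndef CONFIG_TINY
	if (num_possible_cpus() > 1)
		a = cache_line_size();	/* or an internode size on VSMP */
#endif
	return max(a, mandatory);
}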
On Mon, Mar 3, 2008 at 10:24 PM, Nick Piggin <[email protected]> wrote:
> > Then align the objects at cacheline boundaries by providing a value for
> > the align parameter to kmem_cache_create().
>
> max(num_possible_cpus() > 1 ? cache_line_size() : 0, mandatory_alignment)?
>
> Then suppose we want a CONFIG_TINY option to eliminate it?
Hmm... Can't we just fix SLAB_HWCACHE_ALIGN in SLUB to follow the
semantics of SLAB?
On Mon, 3 Mar 2008, Nick Piggin wrote:
> They are only crappy in SLUB. In SLAB they are actually useful.
Cannot figure out what you are talking about....
On Mon, 3 Mar 2008, Nick Piggin wrote:
> > HWCACHE_ALIGN means that you want the object to be aligned at
> > cacheline boundaries for optimization. Why does crossing cacheline
> > boundaries matter in this case?
>
> No, HWCACHE_ALIGN means that you want the object not to cross cacheline
> boundaries for at least cache_line_size() bytes. You invented new
Interesting new definition....
On Mon, 3 Mar 2008, Pekka Enberg wrote:
> Hmm... Can't we just fix SLAB_HWCACHE_ALIGN in SLUB to follow the
> semantics of SLAB?
AFAICT it follows SLAB semantics. The only small difference is for objects
smaller than cache_line_size() / 2, where SLUB does not bother to align to a
fraction of a cacheline since we are already placing multiple objects into a
cacheline. We have effectively made the decision to give up the
organization of objects in separate cache lines.
Let's say you have a 64 byte cache line size. Then the alignment can be
as follows (8 byte alignment is the basic alignment requirement).
Objsize      SLAB   SLUB
-------------------------
 > 64          X      X
 33 .. 64     64     64
 32           32     32
 24           32     24   -> 3 objects per cacheline would be 72 bytes, so one overlaps.
 16           16     16
  8            8      8
So there is only one difference, for slabs with 24 byte objects.
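A quick userspace check of both rules (a 64 byte line and 8 byte minimum
alignment are assumed) reproduces that spacing:

/*
 * Quick userspace check of both rules; prints the resulting per-object
 * spacing for each allocator.
 */
#include <stdio.h>

#define LINE		64UL
#define MINALIGN	8UL

static unsigned long roundup_to(unsigned long n, unsigned long a)
{
	return (n + a - 1) / a * a;
}

/* SLAB rule: halve the cacheline until at least two objects fit. */
static unsigned long slab_spacing(unsigned long size)
{
	unsigned long ralign = LINE;

	while (size <= ralign / 2)
		ralign /= 2;
	if (ralign < MINALIGN)
		ralign = MINALIGN;
	return roundup_to(size, ralign);
}

/* Pre-patch SLUB rule: a full line only when the object is bigger than
 * half a line, otherwise just the minimum alignment. */
static unsigned long slub_spacing(unsigned long size)
{
	return roundup_to(size, size > LINE / 2 ? LINE : MINALIGN);
}

int main(void)
{
	unsigned long sizes[] = { 8, 16, 24, 32, 48, 64 };
	unsigned int i;

	printf("size  SLAB  SLUB\n");
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%4lu  %4lu  %4lu\n", sizes[i],
		       slab_spacing(sizes[i]), slub_spacing(sizes[i]));
	return 0;
}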
Hi,
On Mon, 3 Mar 2008, Nick Piggin wrote:
> > > HWCACHE_ALIGN means that you want the object to be aligned at
> > > cacheline boundaries for optimization. Why does crossing cacheline
> > > boundaries matter in this case?
> >
> > No, HWCACHE_ALIGN means that you want the object not to cross cacheline
> > boundaries for at least cache_line_size() bytes. You invented new
On Mon, Mar 3, 2008 at 11:16 PM, Christoph Lameter <[email protected]> wrote:
> Interesting new definition....
Well, not my definition either but SLAB has guaranteed that for small
objects in the past, so I think Nick has a point here. However, with
all this back and forth, I've lost track of why this matters. I suppose
it causes a regression on some workload?
Pekka
On Mon, 3 Mar 2008, Nick Piggin wrote:
> On Mon, Mar 03, 2008 at 12:17:20PM -0800, Christoph Lameter wrote:
> > On Mon, 3 Mar 2008, Nick Piggin wrote:
> >
> > > > You want two ways of specifying alignments?
> > >
> > > I want a way to ask for SMP friendly allocation.
> >
> > Then align the objects at cacheline boundaries by providing a value for
> > the align parameter to kmem_cache_create().
>
> max(num_possible_cpus() > 1 ? cache_line_size() : 0, mandatory_alignment)?
The mandatory alignment is applied anyway. You do not need to max() on
that.
One could simply specify cache_line_size() with Pekka's patch.
cache_line_size() could default to 0 if !SMP.
> Then suppose we want a CONFIG_TINY option to eliminate it?
No slab allocator supports that right now. However, SLOB in the past has
ignored the alignment in order to reduce memory use. So maybe Matt wants
to introduce this?
> And maybe the VSMP guys will want to blow this out to their internode
> alignment?
>
> max(!CONFIG_TINY && num_possible_cpus() > 1 ? (is_vsmp ? internode_alignemnt : cache_line_size()) : 0, mandatory_alignment)
No, the slab allocators were optimized for VSMP so that the
internode_alignment is not necessary there. That was actually one of the
requirements that triggered the slab numa work.
On Mon, 3 Mar 2008, Pekka Enberg wrote:
> Well, not my definition either but SLAB has guaranteed that for small
> objects in the past, so I think Nick has a point here. However, with
> all this back and forth, I've lost track why this matters. I suppose
> it causes regression on some workload?
Well, the guarantee can only be exploited if you check the cacheline size
and the object size from the code that creates the slab cache.
Basically you would have to guesstimate what the slab allocator is doing.
So the guarantee is basically meaningless. If the object is larger than a
cacheline then this will never work.
Hi,
Christoph Lameter wrote:
> Well the guarantee can only be exploited if you would check the cacheline
> sizes and the object size from the code that creates the slab cache.
> Basically you would have to guestimate what the slab allocator is doing.
>
> So the guarantee is basically meaningless. If the object is larger than a
> cacheline then this will never work.
Yes, I know that. That's why I am asking why this matters. If there's
some sort of regression because SLUB does HWCACHE_ALIGN a bit differently,
we need to fix that. Not that it necessarily means we have to change
HWCACHE_ALIGN but I am assuming Nick has some reason why he wants to
introduce the SMP alignment flag.
Pekka
On Mon, Mar 03, 2008 at 01:16:44PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > > HWCACHE_ALIGN means that you want the object to be aligned at
> > > cacheline boundaries for optimization. Why does crossing cacheline
> > > boundaries matter in this case?
> >
> > No, HWCACHE_ALIGN means that you want the object not to cross cacheline
> > boundaries for at least cache_line_size() bytes. You invented new
>
> Interesting new definition....
Huh?? It is not a new definition, it is exactly what SLAB does. And
then you go and do something different and claim that you follow
what slab does.
On Mon, Mar 03, 2008 at 01:32:54PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Pekka Enberg wrote:
>
> > Well, not my definition either but SLAB has guaranteed that for small
> > objects in the past, so I think Nick has a point here. However, with
> > all this back and forth, I've lost track why this matters. I suppose
> > it causes regression on some workload?
>
> Well the guarantee can only be exploited if you would check the cacheline
> sizes and the object size from the code that creates the slab cache.
> Basically you would have to guestimate what the slab allocator is doing.
>
> So the guarantee is basically meaningless. If the object is larger than a
> cacheline then this will never work.
Of course it works. It fits the object into the fewest number of cachelines
possible. If you need to be accessing such objects in a random manner, then
for highest performance you want to touch as few cachelines as possible.
From: Nick Piggin <[email protected]>
Date: Wed, 5 Mar 2008 01:06:37 +0100
> On Mon, Mar 03, 2008 at 01:16:44PM -0800, Christoph Lameter wrote:
> > On Mon, 3 Mar 2008, Nick Piggin wrote:
> >
> > > > HWCACHE_ALIGN means that you want the object to be aligned at
> > > > cacheline boundaries for optimization. Why does crossing cacheline
> > > > boundaries matter in this case?
> > >
> > > No, HWCACHE_ALIGN means that you want the object not to cross cacheline
> > > boundaries for at least cache_line_size() bytes. You invented new
> >
> > Interesting new definition....
>
> Huh?? It is not a new definition, it is exactly what SLAB does. And
> then you go and do something different and claim that you follow
> what slab does.
I completely agree with Nick.
On Mon, Mar 03, 2008 at 01:31:14PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > On Mon, Mar 03, 2008 at 12:17:20PM -0800, Christoph Lameter wrote:
> > > On Mon, 3 Mar 2008, Nick Piggin wrote:
> > >
> > > > > You want two ways of specifying alignments?
> > > >
> > > > I want a way to ask for SMP friendly allocation.
> > >
> > > Then align the objects at cacheline boundaries by providing a value for
> > > the align parameter to kmem_cache_create().
> >
> > max(num_possible_cpus() > 1 ? cache_line_size() : 0, mandatory_alignment)?
>
> The mandatory alignment is applied anyways. You do not need to max() on
> that.
No that's the caller's required alignment.
> One could simply specify cache_line_size() with Pekka's patch.
> cache_line_size() could default to 0 if !SMP.
That's totally wrong and stupid.
> > Then suppose we want a CONFIG_TINY option to eliminate it?
>
> No slab allocator supports that right now.
That's why I said suppose we want it. Which is not unreasonable.
> However, SLOB in the past has
> ignored the alignment in order to reduce memory use. So maybe Matt wants
> to introduce this?
>
> > And maybe the VSMP guys will want to blow this out to their internode
> > alignment?
> >
> > max(!CONFIG_TINY && num_possible_cpus() > 1 ? (is_vsmp ? internode_alignemnt : cache_line_size()) : 0, mandatory_alignment)
>
> No the slab allocators were optimized for VSMP so that the
> internode_alignment is not necessary there. That was actually one of the
> requirements that triggered the slab numa work.
What do you mean internode_alignment is not necessary there? This is
about memory returned by the allocator to the caller, and the caller
does not want any false sharing of the cacheline on SMP systems. How
can internode alignment not be necessary?
On Mon, Mar 03, 2008 at 11:35:44PM +0200, Pekka Enberg wrote:
> Hi,
>
> Christoph Lameter wrote:
> >Well the guarantee can only be exploited if you would check the cacheline
> >sizes and the object size from the code that creates the slab cache.
> >Basically you would have to guestimate what the slab allocator is doing.
> >
> >So the guarantee is basically meaningless. If the object is larger than a
> >cacheline then this will never work.
>
> Yes, I know that. That's why I am asking why this matters. If there's
> some sort of regression because SLUB does HWCACHE_ALIGN bit differently,
> we need to fix that.
It started out as a SLUB regression that was exposing poor code in the
percpu allocator due to different SLUB kmalloc alignments. That prompted
some further investigation about the alignment handling in the
allocators and showed up this problem with SLUB's HWCACHE_ALIGN. While
I don't know of a regression caused by it as such, it is totally
unreasonable to just change the semantics of it (seemingly for no good
reason). It is used quite a bit in networking for one, and those guys
count every single cache miss in their fastpaths.
> Not that it necessarily means we have to change
> HWCACHE_ALIGN but I am assuming Nick has some reason why he wants to
> introduce the SMP alignment flag.
The SMP flag was just an RFC. I think some people (like Christoph) were
being confused about the HWCACHE_ALIGN flag being for avoiding false
sharing on SMP systems. It would actually also be generally useful to
have the SMP flag (e.g. see the sites I added it to in patch #3).
On Wed, 5 Mar 2008, Nick Piggin wrote:
> It started out as a SLUB regression that was exposing poor code in the
> percpu allocator due to different SLUB kmalloc alignments. That prompted
That was due to SLUB's support for smaller allocation sizes. AFAICT it has
nothing to do with alignment.
> The SMP flag was just an RFC. I think some people (like Christoph) were
> being confused about the HWCACHE_ALIGN flag being for avoiding false
> sharing on SMP systems. It would actually be also generally useful to
> have the SMP flag (eg. see the sites I added it to in patch #3).
Hmmm. We could define a global constant for that? Determine it on bootup
and then pass it as an alignment parameter?
On Tue, 4 Mar 2008, David Miller wrote:
> > Huh?? It is not a new definition, it is exactly what SLAB does. And
> > then you go and do something different and claim that you follow
> > what slab does.
>
> I completely agree with Nick.
So you also want subalignment because of cacheline crossing for 24 byte
slabs? We then only have 2 objects per cacheline instead of 3, but no
crossing anymore.
Well, okay, if there are multiple requests then let's merge Nick's patch that
does this. I still don't think that this will do much ...
Instead of 170 we will only have 128 objects per slab (64 byte
cacheline).
It will affect the following slab caches (mm), reducing the density of
objects:
scsi_bidi_sdb numa_policy fasync_cache xfs_bmap_free_item xfs_dabuf
fstrm_item dm_target_io
Nothing related to networking....
On Wed, Mar 05, 2008 at 12:56:33PM -0800, Christoph Lameter wrote:
> On Wed, 5 Mar 2008, Nick Piggin wrote:
>
> > It started out as a SLUB regression that was exposing poor code in the
> > percpu allocator due to different SLUB kmalloc alignments. That prompted
>
> That was due to SLUB's support for smaller allocation sizes. AFAICT has
> nothing to do with alignment.
The smaller sizes meant objects were less often aligned on cacheline
boundaries.
> > The SMP flag was just an RFC. I think some people (like Christoph) were
> > being confused about the HWCACHE_ALIGN flag being for avoiding false
> > sharing on SMP systems. It would actually be also generally useful to
> > have the SMP flag (eg. see the sites I added it to in patch #3).
>
> Hmmm. We could define a global constant for that? Determine it on bootup
> and then pass it as an alignment parameter?
We could, but I'd rather just use the flag.
On Wed, Mar 05, 2008 at 01:06:30PM -0800, Christoph Lameter wrote:
> On Tue, 4 Mar 2008, David Miller wrote:
>
> > > Huh?? It is not a new definition, it is exactly what SLAB does. And
> > > then you go and do something different and claim that you follow
> > > what slab does.
> >
> > I completely agree with Nick.
>
> So you also want subalignment because of cacheline crossing for 24 byte
> slabs? We then only have 2 objects per cacheline instead of 3 but no
> crossing anymore.
>
> Well okay if there are multiple requests then lets merge Nick's patch that
> does this. Still think that this will do much ...
> Instead of 170 we will only have 128 objects per slab (64 byte
> cacheline).
That's what callers expect when they pass the HWCACHE flag. Wouldn't it
be logical to fix the callers if you think it costs too much memory with
not enough improvement?
> It will affect the following slab caches (mm) reducing the density of
> objects.
>
> scsi_bidi_sdb numa_policy fasync_cache xfs_bmap_free_item xfs_dabuf
> fstrm_item dm_target_io
>
> Nothing related to networking....
There are definitely some networking slabs that pass the flag
and can be non-power-of-2. And don't forget cachelines can be anywhere
up to 256 bytes in size. So yeah, it definitely makes sense to merge
the patch and then examine the callers if you feel strongly about it.
On Thu, 6 Mar 2008, Nick Piggin wrote:
> > That was due to SLUB's support for smaller allocation sizes. AFAICT has
> > nothing to do with alignment.
>
> The smaller sizes meant objects were less often aligned on cacheline
> boundaries.
Right since SLAB_HWCACHE_ALIGN does not align for very small objects.
> We could, but I'd rather just use the flag.
Do you have a case in mind where that would be useful? We had a
SLAB_HWCACHE_MUST_ALIGN or so at some point but it was rarely if ever
used. Note that there is also KMEM_CACHE, which picks up the alignment from
the compiler.
On Thu, 6 Mar 2008, Nick Piggin wrote:
> There looks like definitely some networking slabs that pass the flag
> and can be non-power-of-2. And don't forget cachelines can be anywhere
> up to 256 bytes in size. So yeah it definitely makes sense to merge
> the patch and then examine the callers if you feel strongly about it.
I just don't like to add fluff that has basically no effect. I tried to
improve things by not doing anything special if we cannot cacheline-align
the object. Least surprise (at least for me). It is bad enough that we
just decide to ignore the request for alignment for small caches.
On Thu, Mar 06, 2008 at 02:53:11PM -0800, Christoph Lameter wrote:
> On Thu, 6 Mar 2008, Nick Piggin wrote:
>
> > > That was due to SLUB's support for smaller allocation sizes. AFAICT has
> > > nothing to do with alignment.
> >
> > The smaller sizes meant objects were less often aligned on cacheline
> > boundaries.
>
> Right since SLAB_HWCACHE_ALIGN does not align for very small objects.
It doesn't align small objects to cacheline boundaries in either allocator.
The regression is just because slub can support smaller sizes of objects
AFAIKS.
> > We could, but I'd rather just use the flag.
>
> Do you have a case in mind where that would be useful? We had a
Patch 3/3
> SLAB_HWCACHE_MUST_ALIGN or so at some point but it was rarely to never
> used.
OK, but that's not the same thing.
> Note that there is also KMEM_CACHE which picks up the alignment from
> the compiler.
Yeah, that's not quite as good either. My allocation flag is dynamic, so
it will not bloat things for no reason on UP machines running SMP kernels.
It also aligns to the detected machine cacheline size rather than a
compile-time constant.
On Fri, 7 Mar 2008, Nick Piggin wrote:
> > Do you have a case in mind where that would be useful? We had a
>
> Patch 3/3
Those already have SLAB_HWCACHE_ALIGN.
The point is to switch off alignment for UP? Can't we do that with
SLAB_HWCACHE_ALIGN too, since SLOB seemed to work successfully without it
in the past?
> > Note that there is also KMEM_CACHE which picks up the alignment from
> > the compiler.
>
> Yeah, that's not quite as good either. My allocation flag is dynamic, so
> it will not bloat things for no reason on UP machines and SMP kernels.
> It also aligns to the detected machine cacheline size rather than a
> compile time constant.
Is that really a noteworthy effect?
On Thu, Mar 06, 2008 at 02:56:42PM -0800, Christoph Lameter wrote:
> On Thu, 6 Mar 2008, Nick Piggin wrote:
>
> > There looks like definitely some networking slabs that pass the flag
> > and can be non-power-of-2. And don't forget cachelines can be anywhere
> > up to 256 bytes in size. So yeah it definitely makes sense to merge
> > the patch and then examine the callers if you feel strongly about it.
>
> Just do not like to add fluff that has basically no effect. I just tried
> to improve things by not doing anything special if we cannot cacheline
> align object. Least surprise (at least for me). It is bad enough that we
> just decide to ignore the request for alignment for small caches.
That's just because you (apparently still) have a misconception about what
the flag is supposed to be for. It is not for aligning things to the start
of a cacheline boundary. It is not for avoiding false sharing on SMP. It
is for ensuring that a given object will span the fewest number of
cachelines. This can actually be important if you do anything like random
lookups or tree walks where the object contains the tree node.
Consider a 64 byte cacheline, and a 24 byte object:
cacheline |-------|-------|-------
object |--|--|--|--|--|--|--|--
So if you touch 8 random objects, it is statistically likely to cost you
10 cache misses (so long as the working set is sufficiently cold / larger
than cache that cacheline sharing is insignificant).
If you actually honour HWCACHE_ALIGN, then the same object will be 32
bytes:
cacheline |-------|
object |---|---|
Now 8 will cost 8. A 20% saving. Maybe almost a 20% performance improvement.
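The 10 comes from the layout: in a 192 byte period (three 64 byte lines)
there are eight 24 byte objects and two of them straddle a line boundary, so
eight random touches average ten line fetches. A throwaway program (64 byte
line assumed) gives the same averages:

/*
 * Throwaway check: average cachelines touched per object when objects of
 * the given size are packed back to back; 64 byte lines assumed.
 */
#include <stdio.h>

#define LINE 64UL

static double avg_lines(unsigned long size)
{
	unsigned long period = LINE * size;	/* covers a full repeat of the layout */
	unsigned long off, touched = 0;

	for (off = 0; off < period; off += size)
		touched += (off + size - 1) / LINE - off / LINE + 1;
	return (double)touched / (period / size);
}

int main(void)
{
	printf("24 byte objects: %.2f lines each\n", avg_lines(24));	/* 1.25 */
	printf("32 byte objects: %.2f lines each\n", avg_lines(32));	/* 1.00 */
	return 0;
}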
Before we go around in circles again, do you accept this? If yes, then
what is your argument that SLUB knows better than the caller; if no, then
why not?
On Thu, Mar 06, 2008 at 06:20:40PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > > Do you have a case in mind where that would be useful? We had a
> >
> > Patch 3/3
>
> Those already have SLAB_HWCACHE_ALIGN.
>
> The point is to switch off alignment for UP? Cant we do that with
> SLAB_HWCACHE_ALIGN too since SLOB seemed to work successfully without it
> in the past?
See my next post to correct your understanding of HWCACHE_ALIGN.
> > > Note that there is also KMEM_CACHE which picks up the alignment from
> > > the compiler.
> >
> > Yeah, that's not quite as good either. My allocation flag is dynamic, so
> > it will not bloat things for no reason on UP machines and SMP kernels.
> > It also aligns to the detected machine cacheline size rather than a
> > compile time constant.
>
> Is that really a noteworthy effect?
Yes.
On Fri, 7 Mar 2008, Nick Piggin wrote:
> That's just because you (apparently still) have a misconception about what
> the flag is supposed to be for. It is not for aligning things to the start
> of a cacheline boundary. It is not for avoiding false sharing on SMP. It
The alignment of the object to the start of a cacheline is the obvious
meaning and that is also reflected in the comment in slab.h.
On Fri, 7 Mar 2008, Nick Piggin wrote:
> > Is that really a noteworthy effect?
>
> Yes.
Numbers?
On Thu, Mar 06, 2008 at 06:26:49PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > That's just because you (apparently still) have a misconception about what
> > the flag is supposed to be for. It is not for aligning things to the start
> > of a cacheline boundary. It is not for avoiding false sharing on SMP. It
>
> The alignment of the object to the start of a cacheline is the obvious
> meaning and that is also reflected in the comment in slab.h.
It doesn't say start of cache line. It says align them *on* cachelines.
2 32 byte objects on a 64 byte cacheline are aligned on the cacheline.
2.67 24 byte objects on a 64 byte cacheline are not aligned on the
cacheline.
Anyway, if you want to be myopic about it, then good luck with that.
On Thu, Mar 06, 2008 at 06:27:12PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > > Is that really a noteworthy effect?
> >
> > Yes.
>
> Numbers?
Uhh... big number > small number
On Fri, 7 Mar 2008, Nick Piggin wrote:
> On Thu, Mar 06, 2008 at 06:27:12PM -0800, Christoph Lameter wrote:
> > On Fri, 7 Mar 2008, Nick Piggin wrote:
> >
> > > > Is that really a noteworthy effect?
> > >
> > > Yes.
> >
> > Numbers?
>
> Uhh... big number > small number
Can you quantify the effect?
> It doesn't say start of cache line. It says align them *on* cachelines.
> 2 32 byte objects on a 64 byte cacheline are aligned on the cacheline.
> 2.67 24 bytes objects on a 64 byte cacheline are not aligned on the
> cacheline.
2 32 byte objects means only one is aligned on a cache line.
Certainly cacheline contention is reduced and performance potentially
increased if there are fewer objects in a cacheline.
The same argument can be made for aligning 8 byte objects on 32 byte
boundaries. Instead of 8 objects per cacheline you only have two. Why 8?
Isn't all of this a bit arbitrary and contrary to the intent of avoiding
cacheline contention?
The cleanest solution to this is to specify the alignment for each
slabcache if such an alignment is needed. The alignment can be just a
global constant or function like cache_line_size().
I.e. define
int smp_align;
On bootup check the number of running cpus and then pass smp_align as
the align parameter (most slabcaches have no other alignment needs, if
they do then we can combine these using max).
If we want to do more sophisticated things then let's have a function that
aligns the object on power-of-two boundaries like SLAB_HWCACHE_ALIGN does
now:
sub_cacheline_align(size)
Doing so will make it more transparent as to what is going on and which
behavior we want. And we can get rid of SLAB_HWCACHE_ALIGN with its weird
semantics. Specifying smp_align will truly always align on a cacheline
boundary if we are on an SMP system.
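Roughly like this (just a sketch; setting the value early enough for the
first caches is glossed over here):

/*
 * Sketch only: a boot-time value that callers pass as the align parameter.
 * It would have to be initialized before the earliest kmem_cache_create()
 * calls.
 */
#include <linux/cache.h>
#include <linux/cpumask.h>
#include <linux/init.h>

int smp_align;

void __init smp_align_init(void)
{
	/* Only pay for cacheline alignment when more than one CPU can run. */
	if (num_possible_cpus() > 1)
		smp_align = cache_line_size();
}

/*
 * A caller then writes e.g.:
 *	cachep = kmem_cache_create("foo", sizeof(struct foo),
 *				   smp_align, SLAB_PANIC, NULL);
 */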
On Thu, Mar 06, 2008 at 06:54:19PM -0800, Christoph Lameter wrote:
> > It doesn't say start of cache line. It says align them *on* cachelines.
> > 2 32 byte objects on a 64 byte cacheline are aligned on the cacheline.
> > 2.67 24 bytes objects on a 64 byte cacheline are not aligned on the
> > cacheline.
>
> 2 32 byte objects means only one is aligned on a cache line.
>
> Certainly cacheline contention is reduced and performance potentially
> increased if there are less objects in a cacheline.
>
> The same argument can be made of aligning 8 byte objects on 32 byte
> boundaries. Instead of 8 objects per cacheline you only have two. Why 8?
>
> Isnt all of this a bit arbitrary and contrary to the intend of avoiding
> cacheline contention?
No, it *is not about avoiding cacheline contention*. As such, the rest
of what you wrote below about smp_align etc is rubbish.
Can you actually read what I posted?
On Fri, 7 Mar 2008, Nick Piggin wrote:
> No, it *is not about avoiding cacheline contention*. As such, the rest
> of what you wrote below about smp_align etc is rubbish.
>
> Can you actually read what I posted?
I am trying but it barely makes sense. You first state that it
is *not* about avoiding cacheline contention and then argue that it
reduces cacheline contention.
On Thu, Mar 06, 2008 at 07:18:54PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > No, it *is not about avoiding cacheline contention*. As such, the rest
> > of what you wrote below about smp_align etc is rubbish.
> >
> > Can you actually read what I posted?
>
> I am trying but it barely makes sense. You first state that it
> is *not* about avoiding cacheline contention and then argue that it
> reduces cacheline contention.
Where do I argue that it reduces cacheline contention? All arguments
I made apply to a UP system equally.
On Fri, 7 Mar 2008, Nick Piggin wrote:
> Where do I argue that it reduces cacheline contention? All arguments
> I made apply to a UP system equally.
Huh? I thought you wanted to avoid cacheline alignment on UP to save
memory?
On Thu, Mar 06, 2008 at 07:58:06PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > Where do I argue that it reduces cacheline contention? All arguments
> > I made apply to a UP system equally.
>
> Huh? I thought you wanted to avoid cacheline alignment on UP to save
> memory?
We're talking about HWCACHE_ALIGN in this thread. SMP_ALIGN is for
reducing cacheline contention and that's where you can avoid it on UP.
On Mon, Mar 03, 2008 at 01:31:14PM -0800, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
>
> > And maybe the VSMP guys will want to blow this out to their internode
> > alignment?
> >
> > max(!CONFIG_TINY && num_possible_cpus() > 1 ? (is_vsmp ? internode_alignemnt : cache_line_size()) : 0, mandatory_alignment)
>
> No the slab allocators were optimized for VSMP so that the
> internode_alignment is not necessary there. That was actually one of the
> requirements that triggered the slab numa work.
BTW. can you explain this incredible claim that the slab allocators
do not need internode_alignment to avoid false sharing of allocated
objects on VSMP? I don't understand how that would work at all.
On Fri, 7 Mar 2008, Nick Piggin wrote:
> > No the slab allocators were optimized for VSMP so that the
> > internode_alignment is not necessary there. That was actually one of the
> > requirements that triggered the slab numa work.
>
> BTW. can you explain this incredible claim that the slab allocators
> do not need internode_alignment to avoid false sharing of allocated
> objects on VSMP? I don't understand how that would work at all.
The queues are per node, which means that the pages from which we allocate are
distinct per node. As long as processes allocate and free objects on a
single node, all is fine. Problems can arise, though, if objects are used
across nodes. The earlier incarnation of SLAB had a single global queue for
slab pages and objects could be allocated from an arbitrary processor.
Objects from the same page were allocated from all processors.
On Thu, Mar 06, 2008 at 09:11:39PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > > No the slab allocators were optimized for VSMP so that the
> > > internode_alignment is not necessary there. That was actually one of the
> > > requirements that triggered the slab numa work.
> >
> > BTW. can you explain this incredible claim that the slab allocators
> > do not need internode_alignment to avoid false sharing of allocated
> > objects on VSMP? I don't understand how that would work at all.
>
> The queues are per node which means that pages from which we allocate are
> distinct per node. As long as processes allocate and free object on a
> single node all is fine. Problems can arise though if objects are used
> across node.
This does not solve the problem. Say if you have 2 objects allocated
from a given slab and you want to avoid cacheline contention between
them, then you have to always ensure they don't overlap on the same
cacheline. The fact this happens naturally when you allocate 2 objects
from 2 different nodes doesn't really help: the same single CPU can
allocate objects that you want to avoid false sharing with. Consider
a task struct for example: if you start up a whole lot of worker threads
from a single CPU, then you will allocate them all from that one CPU.
Then they are spread over all CPUs and you get cacheline contention.
Which is why VSMP would legitimately want to use internode alignment
on some structures. And internode alignment is total overkill on any
other type of system, so you can't go around annotating all your
structures with it and hope KMEM_CACHE does the right thing.
On Thu, Mar 06, 2008 at 06:33:35PM -0800, Christoph Lameter wrote:
> On Fri, 7 Mar 2008, Nick Piggin wrote:
>
> > On Thu, Mar 06, 2008 at 06:27:12PM -0800, Christoph Lameter wrote:
> > > On Fri, 7 Mar 2008, Nick Piggin wrote:
> > >
> > > > > Is that really a noteworthy effect?
> > > >
> > > > Yes.
> > >
> > > Numbers?
> >
> > Uhh... big number > small number
>
> Can you quantify the effect?
It obviously won't be a great deal until if/when it starts getting
used more. The space savings on UP are just one reason why my approach
is better than using the static alignment annotations.
Internode alignment is 4k. If you need an object aligned like that then it
may be better to go to the page allocator.
On Thu, Mar 06, 2008 at 09:26:24PM -0800, Christoph Lameter wrote:
> Internode alignment is 4k. If you need an object aligned like that then it
> may be better to go to the page allocator.
But you don't need an object aligned like that unless you are VSMP.
On Thu, Mar 06, 2008 at 09:26:24PM -0800, Christoph Lameter wrote:
> Internode alignment is 4k. If you need an object aligned like that then it
> may be better to go to the page allocator.
I never got a reply about this. Again: how can the modern slab allocator
avoid the need for internode alignment in order to prevent cacheline
pingpong on vsmp? Because I'm going to resubmit the SMP_ALIGN patch.
Also, the last I heard about the HWCACHE_ALIGN from you is that it didn't
make sense... I see it's merged now, which I appreciate, but I think it is
really important that we're on the same page here, so let me know if it
still doesn't make sense.
Thanks,
Nick
On Tue, 11 Mar 2008, Nick Piggin wrote:
> On Thu, Mar 06, 2008 at 09:26:24PM -0800, Christoph Lameter wrote:
> > Internode alignment is 4k. If you need an object aligned like that then it
> > may be better to go to the page allocator.
>
> I never got a reply about this. Again: how can the modern slab allocator
> avoid the need for internode alignment in order to prevent cacheline
> pingpong on vsmp? Because I'm going to resubmit the SMP_ALIGN patch.
You quoted my answer. Use the page allocator to allocate data that has an
alignment requirement on a 4k boundary.
> Also, the last I heard about the HWCACHE_ALIGN from you is that it didn't
> make sense... I see it's merged now, which I appreciate, but I think it is
> really important that we're on the same page here, so let me know if it
> still doesn't make sense.
I still think that the semantics are weird, because it only works sometimes,
and then it performs an alignment within a cacheline that improves the
situation somewhat in some cases but does not give the user what he
expected.