Hi
Slab debugging detects corruption when it allocates and frees slab objects. But if someone writes to slab memory after it has already been freed, slab debugging can only report the object's last free point, and only when the same block is allocated again.

This information is not enough for debugging on a system with complex memory usage. gcc mudflap can insert a check at the point where memory is written, which makes it possible to detect a write to a freed address at the moment it happens.

Because most of the kernel's code is drivers, such problems are most likely to be in drivers, so I compile all drivers with mudflap.

kmemcheck is also helpful for checking the same issue, but it triggers a trap on every read and write, and it needs to analyze the instruction that triggered the trap to get the memory address.
This depends heavily on CPU capabilities. ARM has no single-step support like x86, so kmemcheck is too heavy to run on a real ARM system. The mudflap approach is lighter than kmemcheck for this purpose.

As stack allocations are useless for checking the slab, this kernel mudflap runtime does not implement the alloca support that gcc mudflap requires when code uses variable-length arrays. So variable-length arrays, i.e. array[i] where i cannot be determined by the compiler, are not supported.
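For illustration, mudflap conceptually turns every instrumented write into a checked write, roughly like this (a sketch of the idea rather than gcc's literal output; the location string format here is made up):

	/* original driver code */
	p->count = 1;

	/* what the instrumentation conceptually expands it to */
	__mf_check(&p->count, sizeof(p->count), __MF_CHECK_WRITE, "foo.c:42");
	p->count = 1;

__mf_check() is provided by the runtime below, which looks the address up in the slab to decide whether the write hits a freed object.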
Thanks a lot for comments!
Signed-off-by: janboe <[email protected]>
 arch/arm/include/asm/elf.h |    1 +
 arch/arm/kernel/module.c   |    1 +
 drivers/Makefile           |    4 +-
 include/mf-runtime.h       |   42 ++++++++++++++
 kernel/Makefile            |    2 +-
 kernel/mudflap.c           |  132 ++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          |    7 ++
 mm/slab.c                  |   53 ++++++++++++++++++
 8 files changed, 240 insertions(+), 2 deletions(-)
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index c207504..30cb4b7 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -50,6 +50,7 @@ typedef struct user_fp elf_fpregset_t;
#define R_ARM_ABS32 2
#define R_ARM_CALL 28
#define R_ARM_JUMP24 29
+#define R_ARM_TARGET1 38
#define R_ARM_V4BX 40
#define R_ARM_PREL31 42
#define R_ARM_MOVW_ABS_NC 43
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index bac03c8..8cfe5f5 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -128,6 +128,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
break;
case R_ARM_ABS32:
+ case R_ARM_TARGET1:
*(u32 *)loc += sym->st_value;
break;
diff --git a/drivers/Makefile b/drivers/Makefile
index bc4205d..d5e5063 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -4,7 +4,9 @@
# 15 Sep 2000, Christoph Hellwig <[email protected]>
# Rewritten to use lists instead of if-statements.
#
-
+ifdef CONFIG_MUDFLAP
+subdir-ccflags-y += -fmudflap -fmudflapir
+endif
obj-y += gpio/
obj-$(CONFIG_PCI) += pci/
obj-$(CONFIG_PARISC) += parisc/
diff --git a/include/mf-runtime.h b/include/mf-runtime.h
new file mode 100644
index 0000000..4639d9d
--- /dev/null
+++ b/include/mf-runtime.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_MF_RUNTIME
+#define _LINUX_MF_RUNTIME
+/*
+ * Copyright (C) 2009 Motorola Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include<linux/kernel.h>
+
+#define LOOKUP_CACHE_MASK_DFL 1023
+#define LOOKUP_CACHE_SIZE_MAX 65536 /* Allows max CACHE_MASK 0xFFFF */
+#define LOOKUP_CACHE_SHIFT_DFL 2
+typedef void *__mf_ptr_t;
+typedef unsigned int __mf_uintptr_t;
+struct __mf_cache { __mf_uintptr_t low; __mf_uintptr_t high; };
+void __mf_check(void *ptr, unsigned int sz, int type, const char *location);
+void __mf_register(void *ptr, size_t sz, int type, const char *name);
+void __mf_init(void);
+void __mf_unregister(void *ptr, size_t sz, int type);
+#ifdef _MUDFLAP
+#pragma redefine_extname memcpy __mfwrap_memcpy
+#pragma redefine_extname memset __mfwrap_memset
+#pragma redefine_extname strcpy __mfwrap_strcpy
+#pragma redefine_extname strncpy __mfwrap_strncpy
+#pragma redefine_extname __memzero __mfwrap_memzero
+#endif /* _MUDFLAP */
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a69..c4c2402 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
endif
-
+obj-$(CONFIG_MUDFLAP) += mudflap.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
diff --git a/kernel/mudflap.c b/kernel/mudflap.c
new file mode 100644
index 0000000..d0331c1
--- /dev/null
+++ b/kernel/mudflap.c
@@ -0,0 +1,132 @@
+/*
+ * kernel/mudflap.c
+ *
+ * Copyright (C) 2009 Motorola Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include<linux/kernel.h>
+#include<linux/slab.h>
+#include<linux/module.h>
+#include<linux/mm.h>
+#undef DEBUG /* define this for verbose EDID parsing output */
+
+#ifdef DEBUG
+#define DPRINTK(fmt, args...) printk(fmt, ## args)
+#else
+#define DPRINTK(fmt, args...)
+#endif
+
+/* Codes to describe the type of access to check */
+#define __MF_CHECK_READ 0
+#define __MF_CHECK_WRITE 1
+
+typedef void *__mf_ptr_t;
+typedef unsigned int __mf_uintptr_t;
+struct __mf_cache { __mf_uintptr_t low; __mf_uintptr_t high; };
+#define LOOKUP_CACHE_MASK_DFL 1023
+#define LOOKUP_CACHE_SIZE_MAX 65536 /* Allows max CACHE_MASK 0xFFFF */
+#define LOOKUP_CACHE_SHIFT_DFL 2
+
+struct __mf_cache __mf_lookup_cache[LOOKUP_CACHE_SIZE_MAX];
+EXPORT_SYMBOL(__mf_lookup_cache);
+
+uintptr_t __mf_lc_mask = LOOKUP_CACHE_MASK_DFL;
+EXPORT_SYMBOL(__mf_lc_mask);
+
+unsigned char __mf_lc_shift = LOOKUP_CACHE_SHIFT_DFL;
+EXPORT_SYMBOL(__mf_lc_shift);
+
+void check_slab_write(void *ptr, unsigned int sz,
+ int type, const char *location);
+
+static inline int verify_ptr(unsigned long ptr)
+{
+ if (ptr < PAGE_OFFSET ||
+ (ptr > (unsigned long)high_memory && high_memory != 0))
+ return -EFAULT;
+
+ return 0;
+}
+
+void __mf_check(void *ptr, unsigned int sz, int type, const char *location)
+{
+ if (verify_ptr((unsigned long)ptr))
+ return;
+ if (type) /* write */
+ check_slab_write(ptr, sz, type, location);
+}
+EXPORT_SYMBOL(__mf_check);
+
+void *__mfwrap_memset(void *s, int c, size_t count)
+{
+ __mf_check(s, count, __MF_CHECK_WRITE, "memset dest");
+ return memset(s, c, count);
+}
+EXPORT_SYMBOL(__mfwrap_memset);
+
+void *__mfwrap_memcpy(void *dest, const void *src, size_t count)
+{
+ __mf_check(dest, count, __MF_CHECK_WRITE, "memcpy dest");
+ return memcpy(dest, src, count);
+}
+EXPORT_SYMBOL(__mfwrap_memcpy);
+
+char *__mfwrap_strcpy(char *dest, const char *src)
+{
+ size_t n = strlen(src);
+ __mf_check(dest, n, __MF_CHECK_WRITE, "strcpy dest");
+ return strcpy(dest, src);
+}
+EXPORT_SYMBOL(__mfwrap_strcpy);
+
+void *__mfwrap_strncpy(char *dest, const char *src, size_t count)
+{
+ size_t len = strnlen(src, count);
+ __mf_check(dest, len, __MF_CHECK_WRITE, "strncpy dest");
+ return strncpy(dest, src, count);
+}
+EXPORT_SYMBOL(__mfwrap_strncpy);
+
+#ifdef CONFIG_ARM
+void __mfwrap_memzero(void *ptr, __kernel_size_t n)
+{
+ __mf_check(ptr, n, __MF_CHECK_WRITE, "memzero dest");
+ return __memzero(ptr, n);
+}
+EXPORT_SYMBOL(__mfwrap_memzero);
+#endif
+
+void
+__mf_register(void *ptr, size_t sz, int type, const char *name)
+{
+ DPRINTK("ptr: %p, sz: %d, type: %d, %s\n", ptr, sz, type, name);
+}
+EXPORT_SYMBOL(__mf_register);
+
+void
+__mf_unregister(void *ptr, size_t sz, int type)
+{
+ DPRINTK("ptr: %p, sz: %d, type: %d\n", ptr, sz, type);
+}
+EXPORT_SYMBOL(__mf_unregister);
+
+void __mf_init(void)
+{
+}
+EXPORT_SYMBOL(__mf_init);
+
+int
+__mf_set_options(const char *optstr)
+{
+ return 0;
+}
+EXPORT_SYMBOL(__mf_set_options);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2..6f782e7 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -35,6 +35,13 @@ config FRAME_WARN
Setting this too low will cause a lot of warnings.
Setting it to 0 disables the warning.
Requires gcc 4.4
+config MUDFLAP
+ bool "Enable mudflap to check slab write after free"
+ depends on SLAB && DEBUG_SLAB
+ help
+ This uses mudflap, which was introduced in gcc 4, to check for slab
+ corruption caused by writes after free.
+ This is smaller and faster than kmemcheck.
config MAGIC_SYSRQ
bool "Magic SysRq key"
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4de..bb0d57c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4503,3 +4503,56 @@ size_t ksize(const void *objp)
return obj_size(virt_to_cache(objp));
}
EXPORT_SYMBOL(ksize);
+
+#ifdef CONFIG_MUDFLAP
+void check_slab_write(void *ptr, unsigned int sz, int type,
+ const char *location)
+{
+ struct page *page;
+ struct kmem_cache *cachep;
+ struct slab *slabp;
+ char *realobj;
+ void *objp;
+ int objnr, size, i = 0, count, free_count = 0;
+ page = virt_to_page(ptr);
+ page = compound_head(page);
+ if (PageSlab(page)) {
+ cachep = page_get_cache(page);
+ slabp = page_get_slab(page);
+ objnr = (unsigned)(ptr - slabp->s_mem) / cachep->buffer_size;
+ objp = index_to_obj(cachep, slabp, objnr);
+ size = obj_size(cachep);
+ count = size - 1;
+ realobj = (char *)objp + obj_offset(cachep);
+ while (count--) {
+ i++;
+ if (realobj[i] == POISON_FREE)
+ free_count++;
+ else
+ break;
+ }
+
+ if (i == free_count) {
+ printk(KERN_ERR "ptr: %p, sz: %d, location: %s\n",
+ ptr, sz, location);
+ printk(KERN_ERR "write to freed object, size %d\n",
+ size);
+ i = 0;
+ while (size--) {
+ printk(KERN_ERR "%x", realobj[i]);
+ i++;
+ }
+ printk(KERN_ERR "%x\n", realobj[i]);
+ if (cachep->flags & SLAB_STORE_USER) {
+ printk(KERN_ERR "Last user: [<%p>]",
+ *dbg_userword(cachep, objp));
+ print_symbol("(%s)",
+ (unsigned long)*dbg_userword(cachep, objp));
+ printk("\n");
+ }
+
+ dump_stack();
+ }
+ }
+}
+#endif
Hi Janboe,
On Thu, Jul 9, 2009 at 7:13 PM, Janboe Ye<[email protected]> wrote:
> [cover letter snipped]
Yeah, something like this makes sense. Some comments below.
>  arch/arm/include/asm/elf.h |    1 +
>  arch/arm/kernel/module.c   |    1 +
>  drivers/Makefile           |    4 +-
>  include/mf-runtime.h       |   42 ++++++++++++++
>  kernel/Makefile            |    2 +-
>  kernel/mudflap.c           |  132 ++++++++++++++++++++++++++++++++++++++++++++
>  lib/Kconfig.debug          |    7 ++
>  mm/slab.c                  |   53 ++++++++++++++++++
>  8 files changed, 240 insertions(+), 2 deletions(-)
SLAB is (slowly) going away so you might want to port this to SLUB as
well so we can merge both.
> diff --git a/include/mf-runtime.h b/include/mf-runtime.h
> new file mode 100644
> index 0000000..4639d9d
> --- /dev/null
> +++ b/include/mf-runtime.h
> @@ -0,0 +1,42 @@
> +#ifndef _LINUX_MF_RUNTIME
> +#define _LINUX_MF_RUNTIME
> +/*
> + * Copyright (C) 2009 Motorola Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + */
> +
> +#include<linux/kernel.h>
> +
> +#define LOOKUP_CACHE_MASK_DFL 1023
> +#define LOOKUP_CACHE_SIZE_MAX 65536 /* Allows max CACHE_MASK 0xFFFF */
> +#define LOOKUP_CACHE_SHIFT_DFL 2
Newline here, please. Otherwise this gets pretty messy.
> +typedef void *__mf_ptr_t;
> +typedef unsigned int __mf_uintptr_t;
> +struct __mf_cache { __mf_uintptr_t low; __mf_uintptr_t high; };
Ditto.
> +void __mf_check(void *ptr, unsigned int sz, int type, const char *location);
> +void __mf_register(void *ptr, size_t sz, int type, const char *name);
> +void __mf_init(void);
> +void __mf_unregister(void *ptr, size_t sz, int type);
Ditto.
> +#ifdef _MUDFLAP
> +#pragma redefine_extname memcpy __mfwrap_memcpy
> +#pragma redefine_extname memset __mfwrap_memset
> +#pragma redefine_extname strcpy __mfwrap_strcpy
> +#pragma redefine_extname strncpy __mfwrap_strncpy
> +#pragma redefine_extname __memzero __mfwrap_memzero
> +#endif /* _MUDFLAP */
Hmm, I don't know mudflap but how is this supposed to work now? Is
someone automatically including the header file or do you need to
include it to get memcpy() and friends redefined?
> diff --git a/kernel/mudflap.c b/kernel/mudflap.c
> new file mode 100644
> index 0000000..d0331c1
> --- /dev/null
> +++ b/kernel/mudflap.c
> @@ -0,0 +1,132 @@
> +/*
> + * kernel/mudflap.c
> + *
> + * Copyright (C) 2009 Motorola Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + */
> +
> +#include<linux/kernel.h>
> +#include<linux/slab.h>
> +#include<linux/module.h>
> +#include<linux/mm.h>
> +#undef DEBUG	/* define this for verbose EDID parsing output */
EDID? Anyway, either make a proper CONFIG_MUDFLAP_DEBUG or turn the
thing into a run-time option that's passed as a kernel option at boot.
> +/* Codes to describe the type of access to check */
> +#define __MF_CHECK_READ 0
> +#define __MF_CHECK_WRITE 1
> +
> +typedef void *__mf_ptr_t;
> +typedef unsigned int __mf_uintptr_t;
> +struct __mf_cache { __mf_uintptr_t low; __mf_uintptr_t high; };
Newline
> +#define LOOKUP_CACHE_MASK_DFL 1023
> +#define LOOKUP_CACHE_SIZE_MAX 65536 /* Allows max CACHE_MASK 0xFFFF */
> +#define LOOKUP_CACHE_SHIFT_DFL 2
> +
I wonder if the above should go into mudflap-runtime.h
> +struct __mf_cache __mf_lookup_cache[LOOKUP_CACHE_SIZE_MAX];
> +EXPORT_SYMBOL(__mf_lookup_cache);
> +
> +uintptr_t __mf_lc_mask = LOOKUP_CACHE_MASK_DFL;
> +EXPORT_SYMBOL(__mf_lc_mask);
> +
> +unsigned char __mf_lc_shift = LOOKUP_CACHE_SHIFT_DFL;
> +EXPORT_SYMBOL(__mf_lc_shift);
> +
> +void check_slab_write(void *ptr, unsigned int sz,
> +		int type, const char *location);
You probably want to move this into <linux/slab_def.h>
> +
> +static inline int verify_ptr(unsigned long ptr)
> +{
> +	if (ptr < PAGE_OFFSET ||
> +		(ptr > (unsigned long)high_memory && high_memory != 0))
> +		return -EFAULT;
> +
> +	return 0;
> +}
OK, so what is this function supposed to do?
> +
> +void __mf_check(void *ptr, unsigned int sz, int type, const char *location)
> +{
> +	if (verify_ptr((unsigned long)ptr))
> +		return;
> +	if (type) /* write */
> +		check_slab_write(ptr, sz, type, location);
> +}
So why are we passing type to check_slab_write() as it's clearly
always __MF_CHECK_WRITE? Do we need 'location'? Isn't that obvious
from the stack trace?
> +#ifdef CONFIG_ARM
> +void __mfwrap_memzero(void *ptr, __kernel_size_t n)
> +{
> +	__mf_check(ptr, n, __MF_CHECK_WRITE, "memzero dest");
> +	return __memzero(ptr, n);
> +}
> +EXPORT_SYMBOL(__mfwrap_memzero);
> +#endif
Hmm, is there some HAVE_MEMZERO that we can use here? CONFIG_ARM isn't
really suitable for core code like this so you might want to move that
to arch/arm or do the HAVE_MEMZERO thing.
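Something along these lines, perhaps (an untested sketch; HAVE_MEMZERO would be a new Kconfig symbol that ARM selects):

	#ifdef CONFIG_HAVE_MEMZERO
	void __mfwrap_memzero(void *ptr, __kernel_size_t n)
	{
		/* check the destination before doing the actual clear */
		__mf_check(ptr, n, __MF_CHECK_WRITE, "memzero dest");
		__memzero(ptr, n);
	}
	EXPORT_SYMBOL(__mfwrap_memzero);
	#endif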
> diff --git a/mm/slab.c b/mm/slab.c
> index 7b5d4de..bb0d57c 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -4503,3 +4503,56 @@ size_t ksize(const void *objp)
>  	return obj_size(virt_to_cache(objp));
>  }
>  EXPORT_SYMBOL(ksize);
> +
> +#ifdef CONFIG_MUDFLAP
> +void check_slab_write(void *ptr, unsigned int sz, int type,
> +		const char *location)
> +{
> +	struct page *page;
> +	struct kmem_cache *cachep;
> +	struct slab *slabp;
> +	char *realobj;
> +	void *objp;
> +	int objnr, size, i = 0, count, free_count = 0;
Newline, please
> +	page = virt_to_page(ptr);
> +	page = compound_head(page);
Why not virt_to_head_page() ?
> +	if (PageSlab(page)) {
You could do
if (!PageSlab(page))
return;
to reduce nesting here
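Something like this, roughly (an untested sketch combining the two points):

	page = virt_to_head_page(ptr);
	if (!PageSlab(page))
		return;

	cachep = page_get_cache(page);
	slabp = page_get_slab(page);
	...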
> +		cachep = page_get_cache(page);
> +		slabp = page_get_slab(page);
> +		objnr = (unsigned)(ptr - slabp->s_mem) / cachep->buffer_size;
> +		objp = index_to_obj(cachep, slabp, objnr);
> +		size = obj_size(cachep);
> +		count = size - 1;
> +		realobj = (char *)objp + obj_offset(cachep);
> +		while (count--) {
> +			i++;
> +			if (realobj[i] == POISON_FREE)
> +				free_count++;
> +			else
> +				break;
> +		}
Hmm, why not use slab_bufctl() and checking whether it's BUFCTL_FREE?
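i.e. roughly this (an untested sketch; if I read the DEBUG code right, slab_get_obj() sets an allocated object's bufctl entry to BUFCTL_FREE, so anything else means the object is sitting on the free list):

	/* object not marked as allocated -> this is a write after free */
	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
		...
	}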
> +
> +		if (i == free_count) {
> +			printk(KERN_ERR "ptr: %p, sz: %d, location: %s\n",
> +					ptr, sz, location);
> +			printk(KERN_ERR "write to freed object, size %d\n",
> +					size);
> +			i = 0;
> +			while (size--) {
> +				printk(KERN_ERR "%x", realobj[i]);
> +				i++;
> +			}
> +			printk(KERN_ERR "%x\n", realobj[i]);
> +			if (cachep->flags & SLAB_STORE_USER) {
> +				printk(KERN_ERR "Last user: [<%p>]",
> +						*dbg_userword(cachep, objp));
> +				print_symbol("(%s)",
> +					(unsigned long)*dbg_userword(cachep, objp));
> +				printk("\n");
> +			}
> +
> +			dump_stack();
> +		}
> +	}
> +}
> +#endif
Pekka
Pekka
Thanks for the comments.
> someone automatically including the header file or do you need to
> include it to get memcpy() and friends redefined?
[Janboe] Yes, gcc does this automatically.
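For example, once the header is in effect, a plain libc-style call in driver code is emitted against the wrapper symbol instead; conceptually:

	/* as written in the driver */
	memset(buf, 0, len);

	/* symbol the compiler actually references after the pragma (sketch) */
	__mfwrap_memset(buf, 0, len);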
> +
> > +static inline int verify_ptr(unsigned long ptr)
> > +{
> > + if (ptr < PAGE_OFFSET ||
> > + (ptr > (unsigned long)high_memory && high_memory != 0))
> > + return -EFAULT;
> > +
> > + return 0;
> > +}
>
> OK, so what is this function supposed to do?
>
[Janboe] mudflap inserts checking code for every memory access, so it needs to skip addresses that are I/O memory.
> > +#define LOOKUP_CACHE_MASK_DFL 1023
> > +#define LOOKUP_CACHE_SIZE_MAX 65536 /* Allows max CACHE_MASK 0xFFFF */
> > +#define LOOKUP_CACHE_SHIFT_DFL 2
> > +
>
> I wonder if the above should go into mudflap-runtime.h
[Janboe] this code only exists to satisfy the symbols gcc mudflap expects, so that compilation can succeed.
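For reference, the code gcc emits consults this lookup cache inline before falling back to __mf_check(); conceptually something like this (a sketch, not gcc's literal output):

	struct __mf_cache *e =
		&__mf_lookup_cache[((uintptr_t)ptr >> __mf_lc_shift) & __mf_lc_mask];
	if (e->low > (uintptr_t)ptr || e->high < (uintptr_t)ptr + sz - 1)
		__mf_check(ptr, sz, type, location);

Since this port never fills in the cache, the entries stay zero and every access falls through to __mf_check().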
> Hmm, is there some HAVE_MEMZERO that we can use here? CONFIG_ARM isn't
> really suitable for core code like this so you might want to move that
> to arch/arm or do the HAVE_MEMZERO thing.
[Janboe] No, ARM does not define such a macro. Hmm, I just want to touch as few files as possible.
All other comments are accepted. I will send out another version for review later.
Janboe
* Pekka Enberg <[email protected]> wrote:
> SLAB is (slowly) going away so you might want to port this to SLUB
> as well so we can merge both.
and SLQB which will replace both? :-/
Ingo
Hi Ingo,
On Fri, 2009-07-10 at 10:47 +0200, Ingo Molnar wrote:
> > SLAB is (slowly) going away so you might want to port this to SLUB
> > as well so we can merge both.
>
> and SLQB which will replace both? :-/
Well, I cannot really expect people to port patches to SLQB until it's
in mainline. And whether SLQB will replace SLUB remains to be seen.
We're still fixing minor issues here and there in SLUB so I have no
reason to expect SLQB stabilization to happen overnight which means
we're going to have SLUB in the tree for a while anyway.
Pekka
On Fri, Jul 10, 2009 at 11:52:00AM +0300, Pekka Enberg wrote:
> Hi Ingo,
>
> On Fri, 2009-07-10 at 10:47 +0200, Ingo Molnar wrote:
> > >
> > > SLAB is (slowly) going away so you might want to port this to SLUB
> > > as well so we can merge both.
> >
> > and SLQB which will replace both? :-/
>
> Well, I cannot really expect people to port patches to SLQB until it's
> in mainline.
I agree...
A slab allocator's primary metric is performance, and debugging
features tend to be easy to add after that (because a performance
hit is accepted when they are enabled). So I have not been spending
much time on those in SLQB.
If it gets merged and decided to stay based on performance results,
then I will do more to port over other debug features.
> And whether SLQB will replace SLUB remains to be seen.
> We're still fixing minor issues here and there in SLUB so I have no
> reason to expect SLQB stabilization to happen overnight which means
> we're going to have SLUB in the tree for a while anyway.
I think it's pretty good now. It was the right thing not to merge
it in this window (seeing as I'd forgotten to make it the default
in -next). And that flushed out a bug or two. The core logic I
think is pretty solid now though.
On Fri, 10 Jul 2009, Ingo Molnar wrote:
> > SLAB is (slowly) going away so you might want to port this to SLUB
> > as well so we can merge both.
>
> and SLQB which will replace both? :-/
>
I'm not sure what the status of slqb is, although I would have expected it
to have been pushed for inclusion in 2.6.31 as a slab allocator
alternative. Nick, any forecast for inclusion?
SLUB has a pretty noticeable performance degradation on benchmarks such as
netperf TCP_RR with high numbers of threads (see my post about it:
http://marc.info/?l=linux-kernel&m=123839191416472). CONFIG_SLAB is the
optimal configuration for workloads that share similar slab thrashing
patterns (which my patchset dealt with in an indirect way and yet still
didn't match slab's performance). I haven't yet seen data that suggests
anything other than CONFIG_SLAB has parity with such a benchmark.
Hi Nick,
On Fri, 2009-07-10 at 11:03 +0200, Nick Piggin wrote:
> > And whether SLQB will replace SLUB remains to be seen.
> > We're still fixing minor issues here and there in SLUB so I have no
> > reason to expect SLQB stabilization to happen overnight which means
> > we're going to have SLUB in the tree for a while anyway.
>
> I think it's pretty good now. It was the right thing not to merge
> it in this window (seeing as I'd forgotten to make it the default
> in -next). And that flushed out a bug or two. The core logic I
> think is pretty solid now though.
The long-standing PowerPC issue is still open, isn't it? But anyway, my
main point is that we've already seen from the SLAB to SLUB transition
that while most of the bugs are fixed early on, there's a "fat tail" of
problems ranging from performance regressions to slab corruption which
take a long time to be discovered and fixed up.
And I'm not trying to spread FUD on SLQB here, I'm simply stating the
facts from the previous "slab rewrite" and I have no reason to expect
this one to go any smoother. OTOH, SLQB has already had exposure in
linux-next which hopefully makes merging to mainline less painful
because 95% of the problems are ironed out. But I don't think there's
much we can do about the remaining 5% that only trigger on weird
architectures, workloads, or hardware configurations.
But I think we've been in agreement on this with Nick in the past. So I
guess my rant is directed towards Ingo who seems to be a bit too eager to
merge SLQB and rm mm/sl{a,u}b.c :-).
Pekka
On Fri, Jul 10, 2009 at 02:04:53AM -0700, David Rientjes wrote:
> On Fri, 10 Jul 2009, Ingo Molnar wrote:
>
> > > SLAB is (slowly) going away so you might want to port this to SLUB
> > > as well so we can merge both.
> >
> > and SLQB which will replace both? :-/
> >
>
> I'm not sure what the status of slqb is, although I would have expected it
> to have been pushed for inclusion in 2.6.31 as a slab allocator
> alternative. Nick, any forecast for inclusion?
Just had a hiccup with testing in the last cycle, so we decided
not to merge it this time. I hope next window.
> SLUB has a pretty noticeable performance degradation on benchmarks such as
> netperf TCP_RR with high numbers of threads (see my post about it:
> http://marc.info/?l=linux-kernel&m=123839191416472). CONFIG_SLAB is the
> optimal configuration for workloads that share similiar slab thrashing
> patterns (which my patchset dealt with in an indirect way and yet still
> didn't match slab's performance). I haven't yet seen data that suggests
> anything other than CONFIG_SLAB has parity with such a benchmark.
I did do various netperf runs, but I can't remember whether I tried
to reproduce your test case with SLQB. I'll try ;)
I don't think there are any known performance regressions for SLQB
versus others, but OTOH I don't think it has been widely performance
tested (I don't think many people performance test -next).
Hi David,
On Fri, 10 Jul 2009, Ingo Molnar wrote:
> > > SLAB is (slowly) going away so you might want to port this to SLUB
> > > as well so we can merge both.
> >
> > and SLQB which will replace both? :-/
On Fri, 2009-07-10 at 02:04 -0700, David Rientjes wrote:
> I'm not sure what the status of slqb is, although I would have expected it
> to have been pushed for inclusion in 2.6.31 as a slab allocator
> alternative. Nick, any forecast for inclusion?
2.6.32 most likely. Nick has fixed a bunch of problems but there's still
one ppc boot time bug that's turning out to be hard to find.
On Fri, 2009-07-10 at 02:04 -0700, David Rientjes wrote:
> SLUB has a pretty noticeable performance degradation on benchmarks such as
> netperf TCP_RR with high numbers of threads (see my post about it:
> http://marc.info/?l=linux-kernel&m=123839191416472). CONFIG_SLAB is the
> optimal configuration for workloads that share similiar slab thrashing
> patterns (which my patchset dealt with in an indirect way and yet still
> didn't match slab's performance). I haven't yet seen data that suggests
> anything other than CONFIG_SLAB has parity with such a benchmark.
As I said before, I'm interested in getting those patches merged. I
think Christoph raised some issues that need to be taken care of before
we can do that, no?
Pekka
On Fri, Jul 10, 2009 at 12:14:23PM +0300, Pekka Enberg wrote:
> Hi Nick,
>
> On Fri, 2009-07-10 at 11:03 +0200, Nick Piggin wrote:
> > > And whether SLQB will replace SLUB remains to be seen.
> > > We're still fixing minor issues here and there in SLUB so I have no
> > > reason to expect SLQB stabilization to happen overnight which means
> > > we're going to have SLUB in the tree for a while anyway.
> >
> > I think it's pretty good now. It was the right thing not to merge
> > it in this window (seeing as I'd forgotten to make it the default
> > in -next). And that flushed out a bug or two. The core logic I
> > think is pretty solid now though.
>
> The long-standing PowerPC issue is still open, isnt't it? But anyway, my
Yes.
> main point is that we've already seen from the SLAB to SLUB transition
> that while most of the bugs are fixed early on, there's a "fat tail" of
> problems ranging from performance regressions to slab corruption which
> take a long time to be discovered and fixed up.
True.
> And I'm not trying to spread FUD on SLQB here, I'm simply stating the
> facts from the previous "slab rewrite" and I have no reason to expect
> this one to go any smoother. OTOH, SLQB has already had exposure in
> linux-next which hopefully makes merging to mainline less painful
> because 95% of the problems are ironed out. But I don't think there's
> much we can do about the remaining 5% that only trigger on weird
> architectures, workloads, or hardware configurations.
Well hopefully most of the correctness problems are sorted out,
but I think (like SLUB) most of the hard problems will be
performance related and trickle in after merge. So I'm not sure
at what point we could *remove* other allocators, but for merging
SLQB I think next window should be OK.
What I would like to see is that we eventually make the hard decision
and cull 2 of them. If SLQB is not clearly better (or, if it is
clearly worse) than the other allocators and it can't be improved,
then it has failed my goals for it and I would prefer to remove it
from the tree.
I guess the hard part is how to judge this, and how long to wait
:(
On Fri, 10 Jul 2009, Pekka Enberg wrote:
> > I'm not sure what the status of slqb is, although I would have expected it
> > to have been pushed for inclusion in 2.6.31 as a slab allocator
> > alternative. Nick, any forecast for inclusion?
>
> 2.6.32 most likely. Nick has fixed a bunch of problems but there's still
> one ppc boot time bug that's turning out to be hard to find.
>
Ah, ok, there are still outstanding bugs. I was curious as to why it wasn't
merged as a non-default option that would have perhaps attracted more
attention to it.
> > SLUB has a pretty noticeable performance degradation on benchmarks such as
> > netperf TCP_RR with high numbers of threads (see my post about it:
> > http://marc.info/?l=linux-kernel&m=123839191416472). CONFIG_SLAB is the
> > optimal configuration for workloads that share similiar slab thrashing
> > patterns (which my patchset dealt with in an indirect way and yet still
> > didn't match slab's performance). I haven't yet seen data that suggests
> > anything other than CONFIG_SLAB has parity with such a benchmark.
>
> As I said before, I'm interesting in getting those patches merged. I
> think Christoph raised some issues that need to be take care of before
> we can do that, no?
>
The issue was the addition of an increment to the freeing fastpath and
some arithmetic in the allocation slowpath that would have negatively
affected performance for caches that don't suffer from the issue, even
under affected benchmarks such as netperf TCP_RR.
Even ignoring the impact on workloads that don't suffer from these
patterns, parity with slab still unfortunately isn't reached with the
patchset. It also diverges from the fundamental design of slub which is
optimized by filling up partial slabs as quickly as possible to minimize
the scanning contention on list_lock for high cpu counts and less memory
consumption overall by reducing slab internal fragmentation.
On Fri, Jul 10, 2009 at 02:31:43AM -0700, David Rientjes wrote:
> On Fri, 10 Jul 2009, Pekka Enberg wrote:
>
> > > I'm not sure what the status of slqb is, although I would have expected it
> > > to have been pushed for inclusion in 2.6.31 as a slab allocator
> > > alternative. Nick, any forecast for inclusion?
> >
> > 2.6.32 most likely. Nick has fixed a bunch of problems but there's still
> > one ppc boot time bug that's turning out to be hard to find.
> >
>
> Ah, ok, there's still outstanding bugs. I was curious as to why it wasn't
> merged as a non-default option that would have perhaps attracted more
> attention to it.
Don't know whether it should go in as non-default or not. The plan
is to kind of kick up a stink and make people care about the slab
issue again rather than quietly add more slab allocators to the
tree...
Perhaps making it non-default for one release would be a good idea,
I don't know.
Hi Nick,
On Fri, 2009-07-10 at 11:29 +0200, Nick Piggin wrote:
> What I would like to see is we eventualy make the hard decision
> and cull 2 of them. If SLQB is not clearly better (or, if it is
> clearly worse) than the other allocators and it can't be improved,
> then it has failed my goals for it and I would prefer to remove it
> from the tree.
I'm not sure it's such a hard decision. SLAB is on its way out because
SLUB and SLQB code are much cleaner and the debugging support is better.
As for SLUB vs. SLQB, I'm hoping to trick you, Mel, and others into
doing an "epic slab allocator performance battle" and post the numbers
once we have have everything merged to mainline. My gut feeling is that
SLQB is in the lead because SLUB hasn't addressed the Intel regression.
Pekka
On Fri, 10 Jul 2009, Nick Piggin wrote:
> What I would like to see is we eventualy make the hard decision
> and cull 2 of them. If SLQB is not clearly better (or, if it is
> clearly worse) than the other allocators and it can't be improved,
> then it has failed my goals for it and I would prefer to remove it
> from the tree.
>
The design of slqb, to your credit, appears more extendable than slub and
addressing issues such as the netperf regression (which only manifests
noticeably on my systems with >= 16 cpus) should be easier since they
won't be antagonistic to the allocator's inherent design.
The reason why I'd like to see slqb merged as a non-default alternative is
because it exposes the allocator to more benchmarking coverage than would
be possible in any other environment and attracts much more attention to
development for those who don't pull Pekka's tree directly.
Hi David,
On Fri, 2009-07-10 at 02:41 -0700, David Rientjes wrote:
> The reason why I'd like to see slqb merged as a non-default alternative is
> because it exposes the allocator to more benchmarking coverage than would
> be possible in any other environment and attracts much more attention to
> development for those who don't pull Pekka's tree directly.
Yes, we can do that. But I'm not going to ask Linus to pull something
that has known _boot-time_ bugs (that we haven't been able to figure out
in over three months!) because it's a pretty clear indication that we're
not on top of things yet.
So if you want to fast-track SLQB, feel free to help out Nick in
debugging that sucker ;).
Pekka
On Fri, 10 Jul 2009, Pekka Enberg wrote:
> I'm not sure it's such a hard decision. SLAB is on its way out because
> SLUB and SLQB code are much cleaner and the debugging support is better.
I don't think that is good enough reason. CONFIG_SLAB is by far the
optimal choice for netperf TCP_RR on >= 16 cpus and pushing it out of the
tree, even though it is no longer the default option, isn't an option for
those of us who live by performance even if it comes at the cost of a
dirtier implementation (enhanced debugging support doesn't even register
on my radar since it's useless in a production environment).
Hi David,
On Fri, 10 Jul 2009, Pekka Enberg wrote:
> > I'm not sure it's such a hard decision. SLAB is on its way out because
> > SLUB and SLQB code are much cleaner and the debugging support is better.
On Fri, 2009-07-10 at 02:47 -0700, David Rientjes wrote:
> I don't think that is good enough reason. CONFIG_SLAB is by far the
> optimal choice for netperf TCP_RR on >= 16 cpus and pushing it out of the
> tree, even though it is no longer the default option, isn't an option for
> those of us who live by performance even if it comes at the cost of a
> dirtier implementation (enhanced debugging support doesn't even register
> on my radar since it's useless in a production environment).
Hey, I said SLAB is on its way out (yes, it really is). But I didn't say
we're going to blindly remove it if it performs better than the
alternatives. I don't see any reason why SLQB can't reach the same
performance as SLAB on a fundamental level, though. Can you?
Pekka
On Fri, 10 Jul 2009, Pekka Enberg wrote:
> Hey, I said SLAB is on its way out (yes, it really is). But I didn't say
> we're going to blindly remove it if it performs better than the
> alternatives. I don't see any reason why SLQB can't reach the same
> performance as SLAB on a fundamental level, though. Can you?
>
I'm not really interested in making predictions on which design has the
greatest potential for pure performance, I'm interested in what is proven
to work and does the job better than any alternative. Right now, for
certain workloads, that's undeniably slab. So I'd disagree that slab is
on its way out until another allocator is shown to at least have parity
with it (at which time I'd become more interested in the cleanliness of
the code, the debugging support, etc.).
It's my opinion that slab is on its way out when there's no benchmark that
shows it is superior by any significant amount. If that happens (and if
its successor is slub, slqb, or a yet to be implemented allocator), we can
probably start a discussion on what's in and what's out at that time.
Hi David,
On Fri, Jul 10, 2009 at 1:03 PM, David Rientjes<[email protected]> wrote:
> It's my opinion that slab is on its way out when there's no benchmark that
> shows it is superior by any significant amount.  If that happens (and if
> its successor is slub, slqb, or a yet to be implemented allocator), we can
> probably start a discussion on what's in and what's out at that time.
Performance matters a lot, but it's certainly not the only priority
here. Look at slab, its bootstrap code is a train-wreck which bit us
in the ass when we did the earlyslab thing and the NUMA support is
pretty horrible. The code base hasn't received much attention for the
past few years so unless someone steps up to clean it all, it's on
its way out, like it or not.
So if you care about performance and have benchmarks that are _known_
to regress, you might want to focus your efforts on SLUB and/or SLQB
because that's where the action happens these days.
Pekka
On Fri, Jul 10, 2009 at 01:10:11PM +0300, Pekka Enberg wrote:
> Hi David,
>
> On Fri, Jul 10, 2009 at 1:03 PM, David Rientjes<[email protected]> wrote:
> > It's my opinion that slab is on its way out when there's no benchmark that
> > shows it is superior by any significant amount.  If that happens (and if
> > its successor is slub, slqb, or a yet to be implemented allocator), we can
> > probably start a discussion on what's in and what's out at that time.
>
> Performance matters a lot, but it's certainly not the only priority
> here. Look at slab, it's bootstrap code is a train-wreck which bit us
> in the ass when we did the earlyslab thing and the NUMA support is
> pretty horrible. The code base hasn't received much attention for the
> past few years so unless someone steps up to clean it all, it's on
> it's way out, like it or not.
>
> So if you care about performance and have benchmarks that are _known_
> to regress, you might want to focus your efforts on SLUB and/or SLQB
> because that's where the action happens these days.
Well OTOH performance is something that has some pretty fixed limits
by the design. If SLQB has any performance problems I can't fix, I
do intend to try to take the SLAB base allocator design and implement
it with more modern SL[UQ]B coding style and bootstrap code.
The reason I made some changes with SLQB is because I was trying to
remove nr_cpus*nr_nodes*lots allocations for the alien caches, and
I think the linked list style of queueing allows you to be more flexible
in queue sizes, and more cache efficient (especially when moving them
around).
We'll see...
On Fri, 10 Jul 2009, Nick Piggin wrote:
> Don't know whether it should go in as non-default or not. The plan
> is to kind of kick up a stink and make people care about the slab
> issue again rather than quietly add more slab allocators to the
> tree...
>
> Perhaps making it non-default for one release would be a good idea,
> I don't know.
Just merging it as non default could be done now as far as I can tell.
Then there may be more people working on the stabilization. Then we can
have endless discussions on performance.
The SLUB work is still not complete since the intended final fast path
requires new per cpu operations that get thrown one curveball after
another.
On Fri, Jul 10, 2009 at 03:03:19AM -0700, David Rientjes wrote:
> On Fri, 10 Jul 2009, Pekka Enberg wrote:
>
> > Hey, I said SLAB is on its way out (yes, it really is). But I didn't say
> > we're going to blindly remove it if it performs better than the
> > alternatives. I don't see any reason why SLQB can't reach the same
> > performance as SLAB on a fundamental level, though. Can you?
> >
>
> I'm not really interested in making predictions on which design has the
> greatest potential for pure performance, I'm interested in what is proven
> to work and does the job better than any alternative. Right now, for
> certain workloads, that's undeniably slab. So I'd disagree that slab is
> on its way out until another allocator is shown to at least have parity
> with it (at which time I'd become more interested in the cleanliness of
> the code, the debugging support, etc.).
>
> It's my opinion that slab is on its way out when there's no benchmark that
> shows it is superior by any significant amount. If that happens (and if
> its successor is slub, slqb, or a yet to be implemented allocator), we can
> probably start a discussion on what's in and what's out at that time.
How are you running your netperf test? Over localhost or remotely?
Is it a 16 core system? NUMA?
It seems pretty variable when I run it here, although there seems
to be a pretty clear upper bound on performance, where a lot of the
results land around (then others go anywhere down to less than half
that performance).
Anyway, tried to get an idea of performance on my 8 core NUMA system,
over localhost, and just at 64 threads. Ran the test 60 times for
each allocator.
Rates for 2.6.31-rc2 (+slqb from Pekka's tree)
SLAB: 1869710
SLQB: 1859710
SLUB: 1769400
Slab did have slightly higher maximal numbers too, although
SLQB had the highest minimum. But both were fairly similar there.
SLUB's minimum went down to around 13% lower than the others.
Now I didn't reboot or restart netperf server during runs, so there
is possibility of results drifting for some reason (eg. due to
cache/node placement).
I can't say SLQB beats SLAB here, but it's fairly good. I'll see
if any tweaks can improve it further...
On Wed, 15 Jul 2009, Nick Piggin wrote:
> > It's my opinion that slab is on its way out when there's no benchmark that
> > shows it is superior by any significant amount. If that happens (and if
> > its successor is slub, slqb, or a yet to be implemented allocator), we can
> > probably start a discussion on what's in and what's out at that time.
>
> How are you running your netperf test? Over localhost or remotely?
> Is it a 16 core system? NUMA?
>
I ran it remotely using two machines on the same rack. Both were four
quad-core UMA systems.
> It seems pretty variable when I run it here, although there seems
> to be a pretty clear upper bound on performance, where a lot of the
> results land around (then others go anywhere down to less than half
> that performance).
>
My results from my slub partial slab thrashing patchset comparing slab and
slub were with a variety of different thread counts, each a multiple of
the number of cores. The most notable slub regression always appeared in
the higher thread counts with this script:
#!/bin/bash
TIME=60 # seconds
HOSTNAME=hostname.goes.here # netserver
NR_CPUS=$(grep ^processor /proc/cpuinfo | wc -l)
echo NR_CPUS=$NR_CPUS
run_netperf() {
for i in $(seq 1 $1); do
netperf -H $HOSTNAME -t TCP_RR -l $TIME &
done
}
ITERATIONS=0
while [ $ITERATIONS -lt 10 ]; do
RATE=0
ITERATIONS=$[$ITERATIONS + 1]
THREADS=$[$NR_CPUS * $ITERATIONS]
RESULTS=$(run_netperf $THREADS | grep -v '[a-zA-Z]' | awk '{ print $6 }')
for j in $RESULTS; do
RATE=$[$RATE + ${j/.*}]
done
echo threads=$THREADS rate=$RATE
done
> Anyway, tried to get an idea of performance on my 8 core NUMA system,
> over localhost, and just at 64 threads. Ran the test 60 times for
> each allocator.
>
> Rates for 2.6.31-rc2 (+slqb from Pekka's tree)
> SLAB: 1869710
> SLQB: 1859710
> SLUB: 1769400
>
Great, slqb doesn't regress nearly as much as slub did.
These statistics do show that pulling slab out in favor of slub
prematurely is probably inadvisable, however, when the performance
achieved with slab in this benchmark is far beyond slub's upper bound.
> Now I didn't reboot or restart netperf server during runs, so there
> is possibility of results drifting for some reason (eg. due to
> cache/node placement).
>
SLUB should perform slightly better after the first run on a NUMA system
since its partial lists (for kmalloc-256 and kmalloc-2048) should be
populated with free slabs, which avoid costly page allocations, because of
min_partial.
On Wed, Jul 15, 2009 at 01:19:27PM -0700, David Rientjes wrote:
> On Wed, 15 Jul 2009, Nick Piggin wrote:
>
> > > It's my opinion that slab is on its way out when there's no benchmark that
> > > shows it is superior by any significant amount. If that happens (and if
> > > its successor is slub, slqb, or a yet to be implemented allocator), we can
> > > probably start a discussion on what's in and what's out at that time.
> >
> > How are you running your netperf test? Over localhost or remotely?
> > Is it a 16 core system? NUMA?
> >
>
> I ran it remotely using two machines on the same rack. Both were four
> quad-core UMA systems.
OK, I feared as much ;) I concede localhost testing isn't a good
idea when doing networking tests (especially where cache hotness
of things is concerned). So my numbers don't mean too much.
I'll try to rig up some remote tests (don't have very good network
setups here). Are you just using a single 1GbE, or more?
> > It seems pretty variable when I run it here, although there seems
> > to be a pretty clear upper bound on performance, where a lot of the
> > results land around (then others go anywhere down to less than half
> > that performance).
> >
>
> My results from my slub partial slab thrashing patchset comparing slab and
> slub were with a variety of different thread counts, each a multiple of
> the number of cores. The most notable slub regression always appeared in
> the higher thread counts with this script:
Yes, I was using the script you put in the earlier post. I did
run varying numbers of threads, but there was so much noise in the
results that I just cut it to a single (high) thread count to get
enough runs.
> [script snipped]
>
> > Anyway, tried to get an idea of performance on my 8 core NUMA system,
> > over localhost, and just at 64 threads. Ran the test 60 times for
> > each allocator.
> >
> > Rates for 2.6.31-rc2 (+slqb from Pekka's tree)
> > SLAB: 1869710
> > SLQB: 1859710
> > SLUB: 1769400
> >
>
> Great, slqb doesn't regress nearly as much as slub did.
No, although let's see if I can get some remote numbers.
> These statistics do show that pulling slab out in favor of slub
> prematurely is probably inadvisable, however, when the performance
> achieved with slab in this benchmark is far beyond slub's upper bound.
Well, on these and others, I agree. You're not the only one unhappy
with slub performance (and I wouldn't have wasted my time with
slqb if I was happy with it). I think we can give SLUB some more
time though if Christoph has some improvements.
> > Now I didn't reboot or restart netperf server during runs, so there
> > is possibility of results drifting for some reason (eg. due to
> > cache/node placement).
> >
>
> SLUB should perform slightly better after the first run on a NUMA system
> since its partial lists (for kmalloc-256 and kmalloc-2048) should be
> populated with free slabs, which avoid costly page allocations, because of
> min_partial.
Yeah, although it's probably within the noise really. But I can
turn it into a pseudo-UMA system as well, so I will try with that
too.
Thanks,
Nick