2007-06-07 21:59:48

by Christoph Lameter

[permalink] [raw]
Subject: [patch 00/12] Slab defragmentation V3

Will show up shortly at http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/

Test results (see appended scripts / user space code for more data)

(3 level tree with 10 entries at first level , 20 at the second and 30 files at the
third level. Files at the lowest level were removed to create inode fragmentation)

%Ra is the allocation ratio (need to apply the slabinfo patch to get those numbers)

inode reclaim in reiserfs

Name Objects Objsize Space Slabs/Part/Cpu O/S O %Ra %Ef Flg
dentry 14660 200 3.0M 733/0/1 20 0 100 97 Da
reiser_inode_cache 1596 640 4.1M 256/201/1 25 2 24 24 DCa

Status after defrag

Name Objects Objsize Space Slabs/Part/Cpu O/S O %Ra %Ef Flg
dentry 8849 200 1.8M 454/17/1 20 0 97 95 Da
reiser_inode_cache 1381 640 1.0M 65/11/0 25 2 84 82 DCa



Slab defragmentation can be triggered in two ways:

1. Manually by running

slabinfo -s <slabs-to-shrink>

or manually by the kernel calling

kmem_cache_shrink(slab)

(Currently only ACPI is doing such a call to a slab that has no
defragmentation support. In that case we simply do what SLAB does:
drop per cpu caches and sift through partial list for free slabs).

2. Automatically if defragmentable slabs reach a certain degree of
fragmentation.

The point at which slab defragmentation occurs can be set at

/proc/sys/vm/slab_defrag_ratio

Slab fragmentation is measured by how many of the possible objects in a
slab are in use. The default setting for slab_defrag_ratio is 30%. This
means that slab defragmentation is going to be triggered if there are more than
3 free object slots for each allocated object.

Setting the slab_defrag_ratio higher will cause more defragmentation runs.
If slab_defrag_ratio is set to 0 then no slab defragmentation occurs.

Slabs are checked for their fragmentation levels after the slabs have been shrunk
by running shrinkers in vm/scan.c during memory reclaim. This means that slab
defragmentation is only triggered if we are under memory pressure and if there is
significant slab fragmentation.

V1->V2
- Clean up control flow using a state variable. Simplify API. Back to 2
functions that now take arrays of objects.
- Inode defrag support for a set of filesystems
- Fix up dentry defrag support to work on negative dentries by adding
a new dentry flag that indicates that a dentry is not in the process
of being freed or allocated.

V2->V3
- Support directory reclaim
- Add infrastructure to trigger slab defrag after slab shrinking if we
have slabs with a high degree of fragmentation.



Test script:

#!/bin/sh
#
# Slab defragmentation test run: build a file tree with gazfiles, remove
# the files at the lowest level to fragment the inode/dentry slabs, then
# ask slabinfo to shrink the slabs and show the statistics at each step.

# Print a caption followed by the current "slabinfo -D" output.
report()
{
	echo "$1"
	slabinfo -D
}

# Threshold used by the automatic defragmentation trigger.
echo 30 >/proc/sys/vm/slab_defrag_ratio

# Three levels: 10 directories, 20 directories in each, 30 files in each.
./gazfiles c 3 10 20 30
report "Status before"

# Remove everything below level 2, i.e. the files at the third level.
./gazfiles d 2
report "Status after removing files"

# Manually trigger slab shrinking / defragmentation.
slabinfo -s
report "Status after defrag"

# Level 0 removes the whole gazf tree again.
./gazfiles d 0


gazfiles.c :

/*
* Create a gazillion of files to be able to create slab fragmentation
*
* (C) 2007 sgi, Christoph Lameter <[email protected]>
*
* Create an n-layered hierarchy of empty files
*
* gazfiles <action> <levels> <n1> <n2> ...
*
* gazfiles c[reate] 3 50 50 50
*
* gazfiles s[hrink] <levels>
*
* gazfiles r[andomkill] <nr to kill>
*/

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <stdarg.h>
#include <getopt.h>
#include <regex.h>
#include <errno.h>

/* Capacity of sizes[]; read_gaz() refuses level counts at or above this. */
#define MAXIMUM_LEVELS 10

/* Number of levels of the tree; set by create() or loaded by read_gaz(). */
int level;
/* sizes[l - 1] is the number of entries cre() makes in each level l directory. */
int sizes[MAXIMUM_LEVELS];

/*
 * Report an unrecoverable error and terminate the program.
 *
 * x is a printf-style format string; the remaining arguments are the
 * values it refers to. The formatted text goes to stderr exactly as
 * given (no newline is appended) and the process exits with status 1.
 * This function does not return.
 */
void fatal(const char *x, ...)
{
	va_list args;

	va_start(args, x);
	vfprintf(stderr, x, args);
	va_end(args);

	exit(1);
}

int read_gaz(void)
{
FILE *f = fopen(".gazinfo", "r");
int rc = 0;
int i;

if (!f)
return 0;

if (!fscanf(f, "%d", &level))
goto out;

if (level >= MAXIMUM_LEVELS)
goto out;

for (i = 0; i < level; i++)
if (!fscanf(f, " %d", &sizes[i]))
goto out;
rc = 1;
out:
fclose(f);
return rc;
}

/*
 * Store the tree description in ".gazinfo" in the current directory.
 *
 * Writes the global level followed by the first level entries of
 * sizes[], separated by single spaces and terminated by a newline.
 * If the file cannot be created or written the error is reported on
 * stderr and the program exits with status 1.
 */
void write_gaz(void)
{
	FILE *f = fopen(".gazinfo","w");
	int i;

	/* Without this check a failed fopen() would be dereferenced below. */
	if (!f) {
		perror(".gazinfo");
		exit(1);
	}

	fprintf(f, "%d",level);
	for (i = 0; i < level; i++)
		fprintf(f," %d", sizes[i]);
	fprintf(f, "\n");

	/* Buffered write errors only surface when the stream is flushed. */
	if (fclose(f)) {
		perror(".gazinfo");
		exit(1);
	}
}

/*
 * Populate the current directory with the entries of level l.
 *
 * sizes[l - 1] entries named "000", "001", ... are created. Below the
 * last level (l < level) each entry is a directory that is entered and
 * filled recursively with level l + 1; at the last level each entry is
 * a regular file containing the text "Test".
 *
 * Any failure to create or enter an entry is reported on stderr and
 * terminates the program with status 1, because continuing after a
 * failed chdir() would create entries in the wrong directory.
 */
void cre(int l)
{
	int i;

	for (i = 0; i < sizes[l - 1]; i++) {
		char name[20];

		snprintf(name, sizeof(name), "%03d", i);

		if (l < level) {
			/* An already existing directory is reused. */
			if (mkdir(name, 0775) && errno != EEXIST) {
				perror(name);
				exit(1);
			}
			if (chdir(name)) {
				perror(name);
				exit(1);
			}
			cre(l + 1);
			if (chdir("..")) {
				perror("..");
				exit(1);
			}
		} else {
			FILE *f = fopen(name, "w");

			if (!f) {
				perror(name);
				exit(1);
			}
			fprintf(f, "Test");
			fclose(f);
		}
	}
}

void create(int l, char **sz)
{
int i;

level = l;
for (i = 0; i < level; i++)
sizes[i] = atoi(sz[i]);

if (mkdir("gazf", 0775))
fatal("Cannot create gazf here\n");
chdir("gazf");
write_gaz();
cre(1);
chdir("..");
}

/*
 * Placeholder for the "shrink" action.
 *
 * At present this only enters ./gazf, reloads the tree description
 * with read_gaz() and steps back to the parent directory. The level
 * argument is not used and nothing is removed.
 */
void shrink(int level)
{
	const int entered = (chdir("gazf") == 0);

	if (!entered)
		fatal("No gazfiles in this directory");

	read_gaz();
	chdir("..");
}

/*
 * Walk the current directory recursively and call func for each entry.
 *
 * l is the level of the entries in the current directory. For every
 * entry whose name does not start with '.', func is invoked as
 * func(l, is_dir, name, level) with the current directory being the one
 * that contains the entry. A directory is descended into first and
 * reported afterwards, so func sees a directory only after all of its
 * contents. level is passed through to func unchanged.
 *
 * Exits via fatal() if a directory cannot be opened, entered or left.
 */
void scand(int l, void (*func)(int, int, char *, unsigned long),
		unsigned long level)
{
	DIR *dir;
	struct dirent *de;

	dir = opendir(".");
	if (!dir)
		fatal("Cannot open directory");
	while ((de = readdir(dir))) {
		struct stat s;
		int is_dir;

		if (de->d_name[0] == '.')
			continue;

		/*
		 * d_type is not reliable on every filesystem, so fall back
		 * to stat() when readdir() did not report a directory. The
		 * stat() result is only looked at when the call succeeded.
		 */
		is_dir = (de->d_type == DT_DIR);
		if (!is_dir && stat(de->d_name, &s) == 0 && S_ISDIR(s.st_mode))
			is_dir = 1;

		if (is_dir) {
			if (chdir(de->d_name))
				fatal("Cannot enter %s", de->d_name);
			scand(l + 1, func, level);
			/* Staying below would misplace all further callbacks. */
			if (chdir(".."))
				fatal("Cannot leave %s", de->d_name);
			func(l, 1, de->d_name, level);
		} else {
			func(l, 0, de->d_name, level);
		}
	}
	closedir(dir);
}

/*
 * Run func over the whole tree below ./gazf.
 *
 * Enters gazf (exiting via fatal() if that is not possible), lets
 * scand() visit every entry starting at level 1 with level handed
 * through to func, and finally returns to the parent directory.
 */
void traverse(void (*func)(int, int, char *, unsigned long),
		unsigned long level)
{
	if (chdir("gazf") != 0)
		fatal("No gazfiles in this directory");

	/* Entries directly inside gazf form level 1. */
	scand(1, func, level);

	chdir("..");
}

/*
 * Placeholder for the "randomkill" action.
 *
 * At present this only enters ./gazf, reloads the tree description
 * with read_gaz() and steps back to the parent directory. The nr
 * argument is not used and no file is removed.
 */
void randomkill(int nr)
{
	int failed = chdir("gazf");

	if (failed)
		fatal("No gazfiles in this directory");

	read_gaz();
	chdir("..");
}

/*
 * scand() callback that removes everything below a given level.
 *
 * l is the level of the entry, dir is non-zero if it is a directory,
 * name is its name relative to the current directory and level is the
 * deepest level that must be kept. Entries with l <= level are left
 * alone; deeper entries are removed with rmdir() or unlink().
 *
 * Exits via fatal(), naming the entry, if the removal fails.
 */
void del_func(int l, int dir, char *name, unsigned long level)
{
	/* Same conversion the comparison performed implicitly before. */
	if ((unsigned long)l <= level)
		return;
	if (dir) {
		if (rmdir(name))
			fatal("Cannot remove directory %s", name);
	} else {
		if (unlink(name))
			fatal("Cannot unlink file %s", name);
	}
}

/*
 * Remove part or all of the tree below ./gazf.
 *
 * For a non-zero l the tree is traversed with del_func so that only
 * entries deeper than level l are removed. An l of 0 removes the whole
 * gazf directory by running "rm -rf gazf".
 */
void delete(int l)
{
	if (l != 0) {
		traverse(del_func, l);
		return;
	}

	system("rm -rf gazf");
}

/*
 * Print the command line synopsis on stdout and exit with status 0.
 * This function does not return.
 */
void usage(void)
{
	static const char *const text[] = {
		"gazfiles: Tool to manage gazillions of files\n\n",
		"gazfiles create <levels> <#l1> <#l2> ...\n",
		"gazfiles delete <levels>\n",
		"gazfiles shrink <levels>\n",
		"gazfiles randomkill <nr>\n\n",
		"(C) 2007 sgi, Christoph Lameter <[email protected]>\n",
	};
	size_t i;

	for (i = 0; i < sizeof(text) / sizeof(text[0]); i++)
		fputs(text[i], stdout);

	exit(0);
}

/*
 * Entry point: dispatch on the first character of the action argument.
 *
 *   c  create <levels> <n1> ... : build a new tree (needs one count
 *                                 per level)
 *   s  shrink <levels>
 *   r  randomkill <nr>
 *   d  delete <levels>
 *
 * Any other action, or a wrong number of arguments, prints the usage
 * text and exits. Returns 0 after a completed action.
 */
int main(int argc, char *argv[])
{
	int levels;

	if (argc < 2)
		usage();

	switch (argv[1][0]) {
	case 'c' :
		/* Need the level count plus at least one size. */
		if (argc < 4)
			usage();

		/* Every level must have its size on the command line. */
		levels = atoi(argv[2]);
		if (levels > argc - 3)
			usage();

		create(levels, argv + 3);
		break;
	case 's' :
		if (argc != 3)
			usage();

		shrink(atoi(argv[2]));
		break;
	case 'r' :
		if (argc != 3)
			usage();

		randomkill(atoi(argv[2]));
		break;
	case 'd':
		if (argc != 3)
			usage();
		delete(atoi(argv[2]));
		break;

	default:
		usage();
	}
	return 0;
}
--


2007-06-08 15:16:31

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Hi Christoph,
>
> On 07/06/07, [email protected] <[email protected]> wrote:
> > Will show up shortly at
> http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/
>
> I tried to apply this patchset, but without success. I tried
> 2.6.22-rc4-mm2, 2.6.22-rc4, 2.6.22-rc4-git2, 2.6.22-rc3...

What was the problem?

2007-06-08 15:28:43

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Hi Christoph,
>
> On 07/06/07, [email protected] <[email protected]> wrote:
> > Will show up shortly at
> http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/
>
> I tried to apply this patchset, but without success. I tried
> 2.6.22-rc4-mm2, 2.6.22-rc4, 2.6.22-rc4-git2, 2.6.22-rc3...

Yeah its against 2.6.22-rc4-mm1 and 2.6.22-rc4-mm2 changes kernel/sysctl.c
so that the defrag trigger patch fails. Sigh.

I added kernel versions below slab-defrag so that you can find the correct
version for your kernel.

2007-06-08 18:02:29

by Michal Piotrowski

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

bash shared mapping + your script in a loop
while true; do sudo ./run.sh; done > res3.txt


[ 2866.154597] =======================================================
[ 2866.162384] [ INFO: possible circular locking dependency detected ]
[ 2866.168698] 2.6.22-rc4-mm2 #1
[ 2866.171671] -------------------------------------------------------
[ 2866.177972] bash-shared-map/3245 is trying to acquire lock:
[ 2866.183566] (slub_lock){----}, at: [<c0482510>] kmem_cache_defrag+0x18/0xb3

l *kmem_cache_defrag+0x18
0xc1082510 is in kmem_cache_defrag (mm/slub.c:2742).
2737 struct kmem_cache *s;
2738 unsigned long pages = 0;
2739 void *scratch;
2740
2741 down_read(&slub_lock);
2742 list_for_each_entry(s, &slab_caches, list) {
2743
2744 /*
2745 * The slab cache must have defrag methods.
2746 */


[ 2866.190800]
[ 2866.190801] but task is already holding lock:
[ 2866.196746] (&inode->i_alloc_sem){--..}, at: [<c0498b07>] notify_change+0xdf/0x2ec

l *notify_change+0xdf
0xc1098b07 is in notify_change (fs/attr.c:145).
140 return 0;
141
142 if (ia_valid & ATTR_SIZE)
143 down_write(&dentry->d_inode->i_alloc_sem);
144
145 if (inode->i_op && inode->i_op->setattr) {
146 error = security_inode_setattr(dentry, attr);
147 if (!error)
148 error = inode->i_op->setattr(dentry, attr);
149 } else {


[ 2866.204761]
[ 2866.204762] which lock already depends on the new lock.
[ 2866.204764]
[ 2866.213058]
[ 2866.213060] the existing dependency chain (in reverse order) is:
[ 2866.220630]
[ 2866.220631] -> #2 (&inode->i_alloc_sem){--..}:
[ 2866.226784] [<c0441df1>] add_lock_to_list+0x67/0x8b
[ 2866.232525] [<c0444bb9>] __lock_acquire+0xb02/0xd36
[ 2866.238315] [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.243702] [<c043c0c5>] down_write+0x3e/0x77
[ 2866.248914] [<c0498b07>] notify_change+0xdf/0x2ec
[ 2866.254542] [<c0484161>] do_truncate+0x60/0x79
[ 2866.259927] [<c048d5fe>] may_open+0x1db/0x240
[ 2866.265165] [<c048fbbd>] open_namei+0x2d6/0x6bb
[ 2866.270602] [<c0483a5d>] do_filp_open+0x26/0x3b
[ 2866.275996] [<c0483acf>] do_sys_open+0x5d/0xed
[ 2866.281382] [<c0483b97>] sys_open+0x1c/0x1e
[ 2866.286508] [<c0404182>] sysenter_past_esp+0x5f/0x99
[ 2866.292428] [<b7f9d410>] 0xb7f9d410
[ 2866.296819] [<ffffffff>] 0xffffffff
[ 2866.301177]
[ 2866.301178] -> #1 (&sysfs_inode_imutex_key){--..}:
[ 2866.307632] [<c0441df1>] add_lock_to_list+0x67/0x8b
[ 2866.313425] [<c0444bb9>] __lock_acquire+0xb02/0xd36
[ 2866.319164] [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.324576] [<c065b745>] __mutex_lock_slowpath+0x107/0x369
[ 2866.331008] [<c065b9c3>] mutex_lock+0x1c/0x1f
[ 2866.336314] [<c04c2609>] create_dir+0x1e/0x1c2
[ 2866.341682] [<c04c280d>] sysfs_create_dir+0x60/0x7b
[ 2866.347396] [<c050a335>] kobject_shadow_add+0xd7/0x189
[ 2866.353499] [<c050a3f1>] kobject_add+0xa/0xc
[ 2866.358685] [<c0480f00>] sysfs_slab_add+0x10c/0x152
[ 2866.364374] [<c048111b>] kmem_cache_create+0x13a/0x1d4
[ 2866.370442] [<c083415d>] fasync_init+0x2e/0x37
[ 2866.375818] [<c0824542>] kernel_init+0x14e/0x2bf
[ 2866.381351] [<c0404e7b>] kernel_thread_helper+0x7/0x10
[ 2866.387419] [<ffffffff>] 0xffffffff
[ 2866.391843]
[ 2866.391845] -> #0 (slub_lock){----}:
[ 2866.397022] [<c0442b04>] print_circular_bug_tail+0x2e/0x68
[ 2866.403359] [<c0444aa5>] __lock_acquire+0x9ee/0xd36
[ 2866.409080] [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.414466] [<c043bfff>] down_read+0x3d/0x74
[ 2866.419635] [<c0482510>] kmem_cache_defrag+0x18/0xb3
[ 2866.425540] [<c046c7ac>] shrink_slab+0x1ca/0x1d5
[ 2866.431002] [<c046cc1d>] try_to_free_pages+0x178/0x224
[ 2866.437044] [<c046824f>] __alloc_pages+0x1cd/0x324
[ 2866.442794] [<c0465282>] find_or_create_page+0x5c/0xa6
[ 2866.448817] [<c04c9379>] ext3_truncate+0xbb/0x83b
[ 2866.454411] [<c0472470>] vmtruncate+0x11a/0x140
[ 2866.459762] [<c049894d>] inode_setattr+0x5c/0x137
[ 2866.465286] [<c04caafb>] ext3_setattr+0x19c/0x1f8
[ 2866.470835] [<c0498b61>] notify_change+0x139/0x2ec
[ 2866.476514] [<c0484161>] do_truncate+0x60/0x79
[ 2866.481822] [<c04842af>] do_sys_ftruncate+0x135/0x150
[ 2866.487778] [<c04842e5>] sys_ftruncate64+0x1b/0x1d
[ 2866.493405] [<c040420c>] syscall_call+0x7/0xb
[ 2866.498599] [<b7f10410>] 0xb7f10410
[ 2866.502913] [<ffffffff>] 0xffffffff
[ 2866.507201]
[ 2866.507203] other info that might help us debug this:
[ 2866.507204]
[ 2866.515363] 2 locks held by bash-shared-map/3245:
[ 2866.520151] #0: (&inode->i_mutex){--..}, at: [<c065b9c3>] mutex_lock+0x1c/0x1f
[ 2866.527826] #1: (&inode->i_alloc_sem){--..}, at: [<c0498b07>] notify_change+0xdf/0x2ec
[ 2866.536158]
[ 2866.536160] stack backtrace:
[ 2866.540597] [<c04052ad>] dump_trace+0x63/0x1eb
[ 2866.545187] [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2866.550426] [<c040608d>] show_trace+0x12/0x14
[ 2866.555005] [<c04060a5>] dump_stack+0x16/0x18
[ 2866.559552] [<c0442b35>] print_circular_bug_tail+0x5f/0x68
[ 2866.565216] [<c0444aa5>] __lock_acquire+0x9ee/0xd36
[ 2866.570264] [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.574991] [<c043bfff>] down_read+0x3d/0x74
[ 2866.579487] [<c0482510>] kmem_cache_defrag+0x18/0xb3
[ 2866.584664] [<c046c7ac>] shrink_slab+0x1ca/0x1d5
[ 2866.589462] [<c046cc1d>] try_to_free_pages+0x178/0x224
[ 2866.594796] [<c046824f>] __alloc_pages+0x1cd/0x324
[ 2866.599800] [<c0465282>] find_or_create_page+0x5c/0xa6
[ 2866.605099] [<c04c9379>] ext3_truncate+0xbb/0x83b
[ 2866.609974] [<c0472470>] vmtruncate+0x11a/0x140
[ 2866.614695] [<c049894d>] inode_setattr+0x5c/0x137
[ 2866.619578] [<c04caafb>] ext3_setattr+0x19c/0x1f8
[ 2866.624470] [<c0498b61>] notify_change+0x139/0x2ec
[ 2866.629441] [<c0484161>] do_truncate+0x60/0x79
[ 2866.634075] [<c04842af>] do_sys_ftruncate+0x135/0x150
[ 2866.639339] [<c04842e5>] sys_ftruncate64+0x1b/0x1d
[ 2866.644310] [<c040420c>] syscall_call+0x7/0xb
[ 2866.648823] [<b7f10410>] 0xb7f10410
[ 2866.652482] =======================

http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-dmesg
http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-config

Regards,
Michal

--
"Najbardziej brakowało mi twojego milczenia."
-- Andrzej Sapkowski "Coś więcej"

2007-06-08 18:16:23

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> bash shared mapping + your script in a loop
> while true; do sudo ./run.sh; done > res3.txt

Hmmmm... Seems to be triggered from the reclaim path kmem_cache_defrag
rather than the manual triggered one from the script. Taking the slub_lock
on the reclaim path is an issue it seems.

Maybe we need to do a trylock in kmem_cache_defrag to defuse the
situation? This is after all an optimization so we can bug out.

Does this fix it?

---
mm/slub.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-06-08 11:12:40.000000000 -0700
+++ slub/mm/slub.c 2007-06-08 11:14:34.000000000 -0700
@@ -2738,7 +2738,9 @@ int kmem_cache_defrag(int percent, int n
unsigned long pages = 0;
void *scratch;

- down_read(&slub_lock);
+ if (!down_read_trylock(&slub_lock))
+ return 0;
+
list_for_each_entry(s, &slab_caches, list) {

/*


2007-06-08 19:08:38

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Yes, it does. Thanks!

Ahhh... That led to the discovery of more sysfs problems. I need to make
sure not to be holding locks while calling into sysfs. More cleanup...

2007-06-08 19:32:57

by Michal Piotrowski

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

Christoph Lameter pisze:
> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
>
>> Yes, it does. Thanks!
>
> Ahhh... That leds to the discovery more sysfs problems. I need to make
> sure not to be holding locks while calling into sysfs. More cleanup...
>
>

sysfs... I forgot about my sysfs test case

#! /bin/sh
#
# Read every regular file found below /sys once and discard the output.

# The list of files is produced by find and split on whitespace by the shell.
for i in `find /sys/ -type f`
do
# Announce the file that is about to be read (the message is Polish for
# "displaying").
echo "wyświetlam $i"
sudo cat $i > /dev/null
# sleep 1s
done

[ 2816.175573] BUG: sleeping function called from invalid context at mm/page_alloc.c:1547
[ 2816.183578] in_atomic():1, irqs_disabled():1
[ 2816.187946] 1 lock held by cat/12586:
[ 2816.191705] #0: (&n->list_lock){++..}, at: [<c0481630>] list_locations+0x3d/0x26b

l *list_locations+0x3d
0xc1081630 is in list_locations (mm/slub.c:3388).
3383 struct page *page;
3384
3385 if (!atomic_read(&n->nr_slabs))
3386 continue;
3387
3388 spin_lock_irqsave(&n->list_lock, flags);
3389 list_for_each_entry(page, &n->partial, lru)
3390 process_slab(&t, s, page, alloc);
3391 list_for_each_entry(page, &n->full, lru)
3392 process_slab(&t, s, page, alloc);


[ 2816.199571] irq event stamp: 11526
[ 2816.203054] hardirqs last enabled at (11525): [<c042adbd>] on_each_cpu+0x3b/0x71
[ 2816.210689] hardirqs last disabled at (11526): [<c065d241>] _spin_lock_irqsave+0x13/0x6e
[ 2816.218910] softirqs last enabled at (11236): [<c042b5dd>] __do_softirq+0xdf/0xe5
[ 2816.226635] softirqs last disabled at (11229): [<c0406d65>] do_softirq+0x68/0x11f

l *on_each_cpu+0x3b
0xc102adbd is in on_each_cpu (include/asm/irqflags.h:36).
31 asm volatile("cli": : :"memory");
32 }
33
34 static inline void native_irq_enable(void)
35 {
36 asm volatile("sti": : :"memory");
37 }
38
39 static inline void native_safe_halt(void)
40 {

l *_spin_lock_irqsave+0x13
0xc125d241 is in _spin_lock_irqsave (kernel/spinlock.c:84).
79 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
80 {
81 unsigned long flags;
82
83 local_irq_save(flags);
84 preempt_disable();
85 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
86 /*
87 * On lockdep we dont want the hand-coded irq-enable of
88 * _raw_spin_lock_flags() code, because lockdep assumes

l *__do_softirq+0xdf
0xc102b5dd is in __do_softirq (kernel/softirq.c:252).
247
248 trace_softirq_exit();
249
250 account_system_vtime(current);
251 _local_bh_enable();
252 }
253
254 #ifndef __ARCH_HAS_DO_SOFTIRQ
255
256 asmlinkage void do_softirq(void)

l *do_softirq+0x68
0xc1006d65 is in do_softirq (arch/i386/kernel/irq.c:222).
217 irqctx->tinfo.previous_esp = current_stack_pointer;
218
219 /* build the stack frame on the softirq stack */
220 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
221
222 asm volatile(
223 " xchgl %%ebx,%%esp \n"
224 " call __do_softirq \n"
225 " movl %%ebx,%%esp \n"
226 : "=b"(isp)


[ 2816.234235] [<c04052ad>] dump_trace+0x63/0x1eb
[ 2816.238888] [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2816.244211] [<c040608d>] show_trace+0x12/0x14
[ 2816.248757] [<c04060a5>] dump_stack+0x16/0x18
[ 2816.253288] [<c041eef1>] __might_sleep+0xce/0xd5
[ 2816.258046] [<c04680b5>] __alloc_pages+0x33/0x324
[ 2816.262968] [<c04683fb>] __get_free_pages+0x55/0x66
[ 2816.268060] [<c0481517>] process_slab+0x1bd/0x299
[ 2816.272988] [<c048164a>] list_locations+0x57/0x26b
[ 2816.277981] [<c0481880>] free_calls_show+0x22/0x29
[ 2816.282965] [<c047e702>] slab_attr_show+0x1c/0x20
[ 2816.287891] [<c04c1bd9>] sysfs_read_file+0x94/0x105
[ 2816.293018] [<c048580b>] vfs_read+0xcf/0x158
[ 2816.297539] [<c0485c71>] sys_read+0x3d/0x72
[ 2816.301910] [<c040420c>] syscall_call+0x7/0xb
[ 2816.306486] [<b7f30410>] 0xb7f30410
[ 2816.310165] =======================
[ 2818.826341] BUG: sleeping function called from invalid context at mm/page_alloc.c:1547
[ 2818.834388] in_atomic():1, irqs_disabled():1
[ 2818.838751] 1 lock held by cat/12635:
[ 2818.842506] #0: (&n->list_lock){++..}, at: [<c0481630>] list_locations+0x3d/0x26b
[ 2818.850460] irq event stamp: 11494
[ 2818.853908] hardirqs last enabled at (11493): [<c042adbd>] on_each_cpu+0x3b/0x71
[ 2818.861505] hardirqs last disabled at (11494): [<c065d241>] _spin_lock_irqsave+0x13/0x6e
[ 2818.869831] softirqs last enabled at (11258): [<c042b5dd>] __do_softirq+0xdf/0xe5
[ 2818.877576] softirqs last disabled at (11215): [<c0406d65>] do_softirq+0x68/0x11f
[ 2818.885217] [<c04052ad>] dump_trace+0x63/0x1eb
[ 2818.889893] [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2818.895112] [<c040608d>] show_trace+0x12/0x14
[ 2818.899667] [<c04060a5>] dump_stack+0x16/0x18
[ 2818.904232] [<c041eef1>] __might_sleep+0xce/0xd5
[ 2818.909046] [<c04680b5>] __alloc_pages+0x33/0x324
[ 2818.913956] [<c04683fb>] __get_free_pages+0x55/0x66
[ 2818.919022] [<c0481517>] process_slab+0x1bd/0x299
[ 2818.923923] [<c048164a>] list_locations+0x57/0x26b
[ 2818.928961] [<c0481880>] free_calls_show+0x22/0x29
[ 2818.933916] [<c047e702>] slab_attr_show+0x1c/0x20
[ 2818.938825] [<c04c1bd9>] sysfs_read_file+0x94/0x105
[ 2818.943900] [<c048580b>] vfs_read+0xcf/0x158
[ 2818.948335] [<c0485c71>] sys_read+0x3d/0x72
[ 2818.952683] [<c040420c>] syscall_call+0x7/0xb
[ 2818.957213] [<b7f82410>] 0xb7f82410
[ 2818.960896] =======================

http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-dmesg2

Regards,
Michal

--
"Najbardziej brakowało mi twojego milczenia."
-- Andrzej Sapkowski "Coś więcej"

2007-06-08 19:39:04

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> 0xc1081630 is in list_locations (mm/slub.c:3388).
> 3383 struct page *page;
> 3384
> 3385 if (!atomic_read(&n->nr_slabs))
> 3386 continue;
> 3387
> 3388 spin_lock_irqsave(&n->list_lock, flags);
> 3389 list_for_each_entry(page, &n->partial, lru)
> 3390 process_slab(&t, s, page, alloc);
> 3391 list_for_each_entry(page, &n->full, lru)
> 3392 process_slab(&t, s, page, alloc);


Yes process slab needs some temporary data to generate the lists of
functions calling etc and that is a GFP_TEMPORARY alloc.

Does this fix it?

---
mm/slub.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-06-08 12:35:56.000000000 -0700
+++ slub/mm/slub.c 2007-06-08 12:37:32.000000000 -0700
@@ -2930,7 +2930,7 @@ static int alloc_loc_track(struct loc_tr

order = get_order(sizeof(struct location) * max);

- l = (void *)__get_free_pages(GFP_TEMPORARY, order);
+ l = (void *)__get_free_pages(GFP_ATOMIC, order);

if (!l)
return 0;

2007-06-08 19:40:27

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Christoph Lameter wrote:

> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
>
> > Yes, it does. Thanks!
>
> Ahhh... That leds to the discovery more sysfs problems. I need to make
> sure not to be holding locks while calling into sysfs. More cleanup...

Could you remove the trylock patch and see how this one fares? We may need
both but this should avoid taking the slub_lock around any possible alloc
of sysfs.


SLUB: Move sysfs operations outside of slub_lock

Sysfs can do a gazillion things when called. Make sure that we do
not call any sysfs functions while holding the slub_lock. Let sysfs
fend for itself locking wise.

Just protect the essentials: The modifications to the slab lists
and the ref counters of the slabs.

Signed-off-by: Christoph Lameter <[email protected]>

---
mm/slub.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-06-08 12:21:56.000000000 -0700
+++ slub/mm/slub.c 2007-06-08 12:30:23.000000000 -0700
@@ -2179,12 +2179,13 @@ void kmem_cache_destroy(struct kmem_cach
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s))
WARN_ON(1);
sysfs_slab_remove(s);
kfree(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);

@@ -2637,26 +2638,33 @@ struct kmem_cache *kmem_cache_create(con
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ up_write(&slub_lock);
+
if (sysfs_slab_alias(s, name))
goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
+
+ return s;
+ }
+
+ s = kmalloc(kmem_size, GFP_KERNEL);
+ if (s) {
+ if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
list_add(&s->list, &slab_caches);
+ up_write(&slub_lock);
raise_kswapd_order(s->order);
- } else
- kfree(s);
+
+ if (sysfs_slab_add(s))
+ goto err;
+
+ return s;
+
+ }
+ kfree(s);
}
up_write(&slub_lock);
- return s;

err:
- up_write(&slub_lock);
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else

2007-06-08 19:47:56

by Michal Piotrowski

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

Christoph Lameter pisze:
> On Fri, 8 Jun 2007, Christoph Lameter wrote:
>
>> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
>>
>>> Yes, it does. Thanks!
>> Ahhh... That leds to the discovery more sysfs problems. I need to make
>> sure not to be holding locks while calling into sysfs. More cleanup...
>
> Could you remove the trylock patch and see how this one fares? We may need
> both but this should avoid taking the slub_lock around any possible alloc
> of sysfs.
>
>

It's a bit tricky

cat ../sd2.patch | patch -p1
patching file mm/slub.c
Hunk #1 succeeded at 2194 (offset 15 lines).
Hunk #2 FAILED at 2653.
1 out of 2 hunks FAILED -- saving rejects to file mm/slub.c.rej
[michal@bitis-gabonica linux-work3]$ cat mm/slub.c.rej
***************
*** 2652,2677 ****
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
if (sysfs_slab_alias(s, name))
goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
list_add(&s->list, &slab_caches);
raise_kswapd_order(s->order);
- } else
- kfree(s);
}
up_write(&slub_lock);
- return s;

err:
- up_write(&slub_lock);
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
--- 2653,2685 ----
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ up_write(&slub_lock);
+
if (sysfs_slab_alias(s, name))
goto err;
+
+ return s;
+ }
+
+ s = kmalloc(kmem_size, GFP_KERNEL);
+ if (s) {
+ if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
+ up_write(&slub_lock);
raise_kswapd_order(s->order);
+
+ if (sysfs_slab_add(s))
+ goto err;
+
+ return s;
+
+ }
+ kfree(s);
}
up_write(&slub_lock);

err:
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else

Regards,
Michal

--
"Najbardziej brakowało mi twojego milczenia."
-- Andrzej Sapkowski "Coś więcej"

2007-06-08 20:48:57

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 00/12] Slab defragmentation V3

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> > Could you remove the trylock patch and see how this one fares? We may need
> > both but this should avoid taking the slub_lock around any possible alloc of
> > sysfs.
> It's a bit tricky

Hmmm... Yes that version was against 4-mm1 instead of after the defrag
patchset. The difference is only the "ops" parameter...

Rediff to apply after defrag patchset.

SLUB: Move sysfs operations outside of slub_lock

Sysfs can do a gazillion things when called. Make sure that we do
not call any sysfs functions while holding the slub_lock. Let sysfs
fend for itself locking wise.

Just protect the essentials: The modifications to the slab lists
and the ref counters of the slabs.

Signed-off-by: Christoph Lameter <[email protected]>

---
mm/slub.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-06-08 13:47:32.000000000 -0700
+++ slub/mm/slub.c 2007-06-08 13:48:07.000000000 -0700
@@ -2193,12 +2193,13 @@ void kmem_cache_destroy(struct kmem_cach
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s))
WARN_ON(1);
sysfs_slab_remove(s);
kfree(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);

@@ -2956,26 +2957,33 @@ struct kmem_cache *kmem_cache_create(con
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ up_write(&slub_lock);
+
if (sysfs_slab_alias(s, name))
goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
+
+ return s;
+ }
+
+ s = kmalloc(kmem_size, GFP_KERNEL);
+ if (s) {
+ if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor, ops)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
list_add(&s->list, &slab_caches);
+ up_write(&slub_lock);
raise_kswapd_order(s->order);
- } else
- kfree(s);
+
+ if (sysfs_slab_add(s))
+ goto err;
+
+ return s;
+
+ }
+ kfree(s);
}
up_write(&slub_lock);
- return s;

err:
- up_write(&slub_lock);
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else