2001-12-16 10:44:14

by Ville Herva

[permalink] [raw]
Subject: Allocating 1GB on a 2GB ia64 box trigger oom rambo

On an ia64 box with 2GB memory, 2.4.16+ia64-patch, the OOM rambo is too
hungry for blood:

./memstream2 512
testing mem stream, block size 536870912...

zsh: killed ./memstream2 512

dmesg: Out of Memory: Killed process 28142 (memstream2).


vherva@whale:/home/vherva/scratch>free
total used free shared buffers cached
Mem: 2057888 1353744 704144 0 63904 1100480
-/+ buffers/cache: 189360 1868528
Swap: 255984 166528 89456

vherva@whale:/home/vherva/scratch>cat /proc/slabinfo
slabinfo - version: 1.1 (SMP)
kmem_cache 111 111 432 3 3 1 : 124 62
ip_fib_hash 10 454 32 1 1 1 : 252 126
urb_priv 1 123 128 1 1 1 : 252 126
journal_head 255 354 88 2 2 1 : 252 126
revoke_table 2 816 16 1 1 1 : 252 126
revoke_record 0 0 64 0 0 1 : 252 126
clip_arp_cache 0 0 256 0 0 1 : 252 126
ip_mrt_cache 0 0 128 0 0 1 : 252 126
tcp_tw_bucket 0 0 128 0 0 1 : 252 126
tcp_bind_bucket 20 240 64 1 1 1 : 252 126
tcp_open_request 0 0 128 0 0 1 : 252 126
inet_peer_cache 1 123 128 1 1 1 : 252 126
ip_dst_cache 49 310 256 5 5 1 : 252 126
arp_cache 6 166 192 2 2 1 : 252 126
blkdev_requests 512 581 192 7 7 1 : 252 126
dnotify cache 0 0 40 0 0 1 : 252 126
file lock cache 2 198 160 2 2 1 : 252 126
fasync cache 1 582 24 1 1 1 : 252 126
uid_cache 2 240 64 1 1 1 : 252 126
skbuff_head_cache 439 496 256 8 8 1 : 252 126
sock 230 230 1600 23 23 1 : 60 30
sigqueue 222 348 136 3 3 1 : 252 126
cdev_cache 276 720 64 3 3 1 : 252 126
bdev_cache 6 246 128 2 2 1 : 252 126
mnt_cache 18 246 128 2 2 1 : 252 126
inode_cache 1862 7623 768 363 363 1 : 124 62
dentry_cache 1084 7968 192 96 96 1 : 252 126
dquot 0 0 192 0 0 1 : 252 126
filp 1941 1992 192 24 24 1 : 252 126
names_cache 8 8 4096 2 2 1 : 60 30
buffer_head 289081 360386 192 4339 4342 1 : 252 126
mm_struct 313 372 256 6 6 1 : 252 126
vm_area_struct 4391 4565 192 55 55 1 : 252 126
fs_cache 313 480 64 2 2 1 : 252 126
files_cache 133 133 832 7 7 1 : 124 62
signal_act 110 110 1600 11 11 1 : 60 30
size-131072(DMA) 0 0 131072 0 0 8 : 0 0
size-131072 0 0 131072 0 0 8 : 0 0
size-65536(DMA) 0 0 65536 0 0 4 : 0 0
size-65536 0 0 65536 0 0 4 : 0 0
size-32768(DMA) 0 0 32768 0 0 2 : 0 0
size-32768 3 3 32768 3 3 2 : 0 0
size-16384(DMA) 0 0 16384 0 0 1 : 60 30
size-16384 1 1 16384 1 1 1 : 60 30
size-8192(DMA) 0 0 8192 0 0 1 : 60 30
size-8192 38 68 8192 19 34 1 : 60 30
size-4096(DMA) 0 0 4096 0 0 1 : 60 30
size-4096 220 220 4096 55 55 1 : 60 30
size-2048(DMA) 0 0 2048 0 0 1 : 60 30
size-2048 232 232 2048 29 29 1 : 60 30
size-1024(DMA) 0 0 1024 0 0 1 : 124 62
size-1024 720 720 1024 48 48 1 : 124 62
size-512(DMA) 0 0 512 0 0 1 : 124 62
size-512 186 186 512 6 6 1 : 124 62
size-256(DMA) 0 0 256 0 0 1 : 252 126
size-256 2528 2604 256 42 42 1 : 252 126
size-128(DMA) 1 123 128 1 1 1 : 252 126
size-128 448 738 128 6 6 1 : 252 126
size-64(DMA) 7 240 64 1 1 1 : 252 126
size-64 2442 7200 64 30 30 1 : 252 126



vherva@whale:/home/vherva/scratch>cat memstream2.c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>

/* Byte-size multipliers, each built from the previous one. */
#define KB (1024)
/* Used by main() to turn a megabyte count into a byte count. */
#define MB (1024 * KB)
/* 2^30; not referenced by the code shown in this listing. */
#define GB (1024 * MB)

/*
 * Compute a simple additive checksum over a buffer.
 *
 * The buffer is treated as a sequence of native-endian `unsigned` words
 * that are summed with ordinary unsigned wrap-around.  If `len` is not a
 * multiple of sizeof(unsigned), the trailing bytes are folded in as one
 * final word padded with zero bytes, so the function never reads past
 * buffer[len - 1].
 *
 * buffer: start of the data to sum (may have any alignment).
 * len:    number of bytes to sum; values <= 0 yield 0.
 *
 * Returns the sum of all words, modulo 2^(bits in unsigned).
 */
static inline unsigned chksum(char* buffer, int len)
{
    const size_t word = sizeof(unsigned);
    size_t remaining;
    unsigned sum = 0;

    if (buffer == NULL || len <= 0)
        return 0;

    remaining = (size_t)len;

    /* memcpy keeps the word loads valid for any alignment and avoids
     * reading char storage through an unsigned pointer. */
    while (remaining >= word) {
        unsigned value;

        memcpy(&value, buffer, word);
        sum += value;
        buffer += word;
        remaining -= word;
    }

    /* Partial trailing word: pad with zeros instead of over-reading. */
    if (remaining > 0) {
        unsigned value = 0;

        memcpy(&value, buffer, remaining);
        sum += value;
    }

    return sum;
}

/*
 * Memory streaming test.
 *
 * Allocates two buffers of `blocksz` bytes (default 64 MB, or argv[1]
 * megabytes), fills the first with pseudo-random bytes, copies it to the
 * second, and then checksums both buffers forever.  Any checksum mismatch
 * is reported on stderr and terminates the program.  A throughput figure
 * is printed each time more than a quarter gigabyte has been streamed
 * since the last report.
 *
 * The block size is limited to what fits in an int because chksum()
 * takes its length as an int.
 */
int main(int argc, char** argv)
{
    unsigned long blocksz = 64*MB;
    unsigned long long streamed = 0, last = 0;
    unsigned iter = 0;
    unsigned orig_checksum = 0;
    const unsigned long long qgiga = 1024 * 1024 * 1024 / 4;
    time_t t = time(NULL);
    char *buffer1, *buffer2;
    unsigned long pos;
    int i;

    if (argc > 1) {
        char *end;
        unsigned long megs = strtoul(argv[1], &end, 10);

        /* Reject non-numeric input, zero, and sizes whose byte count
         * would not fit in the int length that chksum() accepts. */
        if (end == argv[1] || *end != '\0' ||
            megs == 0 || megs > (unsigned long)(INT_MAX / MB)) {
            fprintf(stderr, "usage: %s [block size in MB, 1-%d]\n",
                    argv[0], INT_MAX / MB);
            exit(1);
        }
        blocksz = megs * MB;
    }

    fprintf(stderr, "testing mem stream, block size %lu...\n\n ", blocksz);

    buffer1 = malloc(blocksz + 256);
    buffer2 = malloc(blocksz + 256);
    if (!buffer1 || !buffer2) {
        perror("malloc failed");
        exit(1);
    }

    /* Unsigned arithmetic: index + rand() may exceed INT_MAX. */
    for (pos = 0; pos < blocksz; pos++)
        buffer1[pos] = (char)((pos + (unsigned long)rand()) % 256);

    orig_checksum = chksum(buffer1, (int)blocksz);
    memcpy(buffer2, buffer1, blocksz);

    while (1) {
        int times = 8;
        unsigned checksum1, checksum2;

        for (i = 0; i < times; i++) {
            checksum1 = chksum(buffer1, (int)blocksz);
            checksum2 = chksum(buffer2, (int)blocksz);

            if (orig_checksum != checksum2 || checksum1 != checksum2) {
                /* Report both freshly computed sums and the reference. */
                fprintf(stderr, "Blocks differ! %x, %x (expected %x)\n",
                        checksum1, checksum2, orig_checksum);
                fprintf(stderr,
                        "Read the blocks %u time (streamed %llu bytes.)\n",
                        (iter * times + i * 2),
                        streamed + (unsigned long long)blocksz * i * 2);
                exit(-1);
            }
        }

        /* Each pass reads both buffers `times` times. */
        streamed += (unsigned long long)blocksz * times * 2;

        if (streamed - last > qgiga) {
            time_t now = time(NULL);
            double elapsed = difftime(now, t);

            /* time() has one-second resolution; avoid dividing by zero. */
            if (elapsed < 1.0)
                elapsed = 1.0;

            fprintf(stderr, "\010\010\010\010\010\010\010\010\010\010.%6.1fMB/s",
                    (double)(streamed - last) / elapsed / (double)MB);
            t = now;
            last = streamed;
        }

        iter++;
    }

    return 0;
}


I tried to reproduce with a simpler program:

#include <stdio.h>
#include <stdlib.h>

/* Six backspace characters (octal 010): the width of the "%4uMB" progress
 * field printed by main(), so each update overwrites the previous one. */
#define BKSP "\010\010\010\010\010\010"

/*
 * Allocate `megs` megabytes (default 512, or argv[1]) and touch every
 * byte, printing a running megabyte counter on stderr.
 *
 * Returns 0 once every byte has been written, 1 on bad arguments or
 * allocation failure.
 */
int main(int argc, char** argv)
{
    unsigned long megs = 512;
    unsigned long size, i;
    unsigned char* buf;

    if (argc > 1) {
        char *end;

        megs = strtoul(argv[1], &end, 10);
        /* Reject non-numeric input and sizes whose byte count would
         * overflow unsigned long. */
        if (end == argv[1] || *end != '\0' ||
            megs > (unsigned long)-1 / (1024UL * 1024UL)) {
            fprintf(stderr, "usage: %s [megabytes]\n", argv[0]);
            return 1;
        }
    }
    size = megs * 1024 * 1024;

    fprintf(stderr, "Allocating %lu megs...\n\n ", megs);
    buf = malloc(size);
    /* malloc(0) may legitimately return NULL, so only fail for size > 0. */
    if (!buf && size != 0) {
        perror("malloc failed");
        return 1;
    }

    for (i = 0; i < size; i++) {
        buf[i] = 42;

        /* Refresh the counter once per completed megabyte. */
        if ((i + 1) % (1024 * 1024) == 0)
            fprintf(stderr, BKSP "%4luMB", (i + 1) / 1024 / 1024);
    }

    fprintf(stderr, "\n Success.\n");

    /* The buffer is deliberately not freed: the process exits right away,
     * and a malloc/free pair would let an optimiser drop the stores. */
    return 0;
}

I didn't get killed even with 1536 yet, but it basically halts at 1200+
megs:

./alloc 1536
Allocating 1536 megs...

1263MB
<hangs there making very little progress>

free
total used free shared buffers cached
Mem: 2057888 2051280 6608 0 63280 665296
-/+ buffers/cache: 1322704 735184
Swap: 255984 255984 0

>From top (after ten minutes):
5 root 12 0 0 0 0 SW 88.9 0.0 5:29 kswapd
28312 vherva 20 0 1385M 1.3G 784 R 88.2 16.0 4:21 alloc


So again, releasing cache seems to be the culprit.

I can try -aa if you think it's worth it.


-- v --

[email protected]


2001-12-16 17:14:10

by Ville Herva

[permalink] [raw]
Subject: Re: malloc 1GB on a 2GB ia64 box fails - 17rc1 woes w/ qla1280 and reiserfs

I spent a good while trying to reproduce this on 17rc1 (never got as far as
to try -aa), but I never got it booting.

I did get the rejects weeded out (-rc1 and ia64-011214 didn't go together
cleanly), although there was some stuff in ptrace.c which I'm really not too
sure of.

It didn't boot, though. qla1280 just hung after "verifying chip" phase.
Strangely, I don't see any changes to qla1280.c in -rc1.

Also, the 2.4.16 kernel I'm using now has gone through a lot of unclean
reboots (usb shutdown used to hang hard until I just disabled the whole
thing.) It had been mostly ok, but after one unclean boot reiserfs got into
a state where an attempt to mount it crashed on the next boot. reiserfsck -x
-o -i fixed it, but I think it's still nasty. This was with 2.4.16 (17rc1
never booted thus far).

I hope you can reproduce it on a newer kernel (17rc1 or -aa) - it should be
easy, just fill the cache (find / -type f -exec cat {} \; > /dev/null) and
run the test prog.


-- v --

[email protected]

2001-12-17 07:06:32

by Ville Herva

[permalink] [raw]
Subject: Ia64 unaligned accesses in ntfs driver

I get unaligned accesses from these addresses:

kernel unaligned access to 0xe00000006fb49719, ip=0xa000000000265050

from ksymoops:
Adhoc a000000000265050 <[ntfs]ntfs_decompress+d0/320>
Adhoc a000000000262d80 <[ntfs]ntfs_decompress_run+2a0/3c0>
Adhoc a000000000262ba0 <[ntfs]ntfs_decompress_run+c0/3c0>
Adhoc a000000000262d60 <[ntfs]ntfs_decompress_run+280/3c0>

Are these dangerous? I gather IA64 port has some kind of handler for these,
since they don't oops.


-- v --

[email protected]

2001-12-17 07:03:42

by Ville Herva

[permalink] [raw]
Subject: .17-rc1 - oom killer still sneaks in

On Sun, Dec 16, 2001 at 07:13:25PM +0200, you [Ville Herva] claimed:
> I spent a good while trying to reproduce this on 17rc1 (never got as far as
> to try -aa), but I never got it booting.

Ok, I got 17-rc1 to boot (thanks to Matt Domsch). What happens is this:


uname -a
Linux whale.viasys.com 2.4.17-rc1 #4 SMP Sun Dec 16 17:42:51 EET 2001 ia64 unknown

## Fill the cache
find / -type f -exec cat {} \; > /dev/null
updatedb


total used free shared buffers cached
Mem: 2057536 2045744 11792 0 165120 1420112
-/+ buffers/cache: 460512 1597024
Swap: 255984 2064 253920

slabinfo - version: 1.1 (SMP)
kmem_cache 111 111 432 3 3 1 : 124 62
ip_fib_hash 10 454 32 1 1 1 : 252 126
devfsd_event 1 454 32 1 1 1 : 252 126
clip_arp_cache 0 0 256 0 0 1 : 252 126
ip_mrt_cache 0 0 128 0 0 1 : 252 126
tcp_tw_bucket 0 0 128 0 0 1 : 252 126
tcp_bind_bucket 36 240 64 1 1 1 : 252 126
tcp_open_request 0 0 128 0 0 1 : 252 126
inet_peer_cache 1 123 128 1 1 1 : 252 126
ip_dst_cache 186 186 256 3 3 1 : 252 126
arp_cache 3 166 192 2 2 1 : 252 126
blkdev_requests 512 581 192 7 7 1 : 252 126
journal_head 210 2655 88 12 15 1 : 252 126
revoke_table 1 816 16 1 1 1 : 252 126
revoke_record 0 0 64 0 0 1 : 252 126
dnotify cache 0 0 40 0 0 1 : 252 126
file lock cache 99 198 160 1 2 1 : 252 126
fasync cache 1 582 24 1 1 1 : 252 126
uid_cache 2 240 64 1 1 1 : 252 126
skbuff_head_cache 370 496 256 8 8 1 : 252 126
sock 220 220 1600 22 22 1 : 60 30
sigqueue 290 348 136 3 3 1 : 252 126
cdev_cache 152 480 64 2 2 1 : 252 126
bdev_cache 5 246 128 2 2 1 : 252 126
mnt_cache 16 246 128 2 2 1 : 252 126
inode_cache 169008 169008 768 8048 8048 1 : 124 62
dentry_cache 170316 170316 192 2052 2052 1 : 252 126
dquot 0 0 192 0 0 1 : 252 126
filp 1579 1660 192 20 20 1 : 252 126
names_cache 16 16 4096 4 4 1 : 60 30
buffer_head 393459 394333 192 4746 4751 1 : 252 126
mm_struct 186 186 256 3 3 1 : 252 126
vm_area_struct 3609 3735 192 45 45 1 : 252 126
fs_cache 240 240 64 1 1 1 : 252 126
files_cache 133 133 832 7 7 1 : 124 62
signal_act 90 90 1600 9 9 1 : 60 30


vherva@whale:/home/vherva/scratch>./alloc 1700
Allocating 1700 megs...

1615MB
zsh: killed ./alloc 1700
dmesg: Out of memory: killed process 10298 (alloc)

It got there pretty quick - .16 spent ~15 minutes in around 1200MB grinding
in kswapd.

total used free shared buffers cached
Mem: 2057536 513184 1544352 0 366224 20096
-/+ buffers/cache: 126864 1930672
Swap: 255984 141728 114256

slabinfo - version: 1.1 (SMP)
kmem_cache 111 111 432 3 3 1 : 124 62
ip_fib_hash 10 454 32 1 1 1 : 252 126
devfsd_event 0 0 32 0 0 1 : 252 126
clip_arp_cache 0 0 256 0 0 1 : 252 126
ip_mrt_cache 0 0 128 0 0 1 : 252 126
tcp_tw_bucket 0 0 128 0 0 1 : 252 126
tcp_bind_bucket 28 240 64 1 1 1 : 252 126
tcp_open_request 0 0 128 0 0 1 : 252 126
inet_peer_cache 2 123 128 1 1 1 : 252 126
ip_dst_cache 48 248 256 4 4 1 : 252 126
arp_cache 5 83 192 1 1 1 : 252 126
blkdev_requests 512 581 192 7 7 1 : 252 126
journal_head 146 885 88 5 5 1 : 252 126
revoke_table 1 816 16 1 1 1 : 252 126
revoke_record 0 0 64 0 0 1 : 252 126
dnotify cache 0 0 40 0 0 1 : 252 126
file lock cache 2 99 160 1 1 1 : 252 126
fasync cache 1 582 24 1 1 1 : 252 126
uid_cache 2 240 64 1 1 1 : 252 126
skbuff_head_cache 336 496 256 8 8 1 : 252 126
sock 210 210 1600 21 21 1 : 60 30
sigqueue 222 348 136 3 3 1 : 252 126
cdev_cache 273 480 64 2 2 1 : 252 126
bdev_cache 5 246 128 2 2 1 : 252 126
mnt_cache 16 246 128 2 2 1 : 252 126
inode_cache 910 3213 768 153 153 1 : 124 62
dentry_cache 953 6723 192 81 81 1 : 252 126
dquot 0 0 192 0 0 1 : 252 126
filp 1632 1826 192 22 22 1 : 252 126
names_cache 8 8 4096 2 2 1 : 60 30
buffer_head 93108 158613 192 1904 1911 1 : 252 126
mm_struct 181 186 256 3 3 1 : 252 126
vm_area_struct 3692 3818 192 46 46 1 : 252 126
fs_cache 181 480 64 2 2 1 : 252 126
files_cache 114 114 832 6 6 1 : 124 62
signal_act 90 90 1600 9 9 1 : 60 30


So it seems reluctant to give up buffers.

Much better than .16, though, and it readily frees icache and dcache.

Want me to try -aa?


-- v --

[email protected]

2001-12-17 09:47:14

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: Ia64 unaligned accesses in ntfs driver

At 07:05 17/12/01, Ville Herva wrote:
>I get unaligned accesses from these addresses:
>
>kernel unaligned access to 0xe00000006fb49719, ip=0xa000000000265050
>
>from ksymoops:
>Adhoc a000000000265050 <[ntfs]ntfs_decompress+d0/320>
>Adhoc a000000000262d80 <[ntfs]ntfs_decompress_run+2a0/3c0>
>Adhoc a000000000262ba0 <[ntfs]ntfs_decompress_run+c0/3c0>
>Adhoc a000000000262d60 <[ntfs]ntfs_decompress_run+280/3c0>
>
>Are these dangerous? I gather IA64 port has some kind of handler for these,
>since they don't oops.

They are at least one of the explanations why the driver would not work on
non-intel arch... I gather most other arch don't cope with unaligned
accesses. I am surprised those are the only ones you see actually...

This particular function is not implemented correctly anyway - it will not
work on BE arch for example (despite all the endian conversion functions,
some of which are wrong AFAIK).

The changes to make the driver clean are too complex and I am not going to
bother considering the replacement ntfs driver (ntfs tng available from
linux-ntfs cvs on sourceforge) is close to being ready for inclusion into
2.5.x (as soon as read support is completed I will submit it, probably
sometime in January). If anyone wants to work on the old driver I am happy
to take patches. (-;

The new driver should be completely endianness clean and any unaligned
accesses will be dealt with as they are identified. I know of a few
possible ones which I will need to verify and wrap in the get unaligned
macros before release. But for tracking down the rest I will need people to
test the driver as I don't have access to any non-ia32 arch to test on
myself... I don't think there will be many though as most structures in
ntfs have nice alignment guarantees. - The mapping pairs array being a
notable difference which is the source of the unaligned accesses you
report. The new driver handles them correctly by working on a byte-by-byte
basis instead of doing multi-byte accesses, which is the correct way to
decompress the mapping pairs array.

Best regards,

Anton


--
"I've not lost my mind. It's backed up on tape somewhere." - Unknown
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Linux NTFS Maintainer / WWW: http://linux-ntfs.sf.net/
ICQ: 8561279 / WWW: http://www-stu.christs.cam.ac.uk/~aia21/

2001-12-17 10:32:33

by Ville Herva

[permalink] [raw]
Subject: Re: Ia64 unaligned accesses in ntfs driver

On Mon, Dec 17, 2001 at 09:47:08AM +0000, you [Anton Altaparmakov] claimed:
> At 07:05 17/12/01, Ville Herva wrote:
> >I get unaligned accesses from these addresses:
> >
> >kernel unaligned access to 0xe00000006fb49719, ip=0xa000000000265050
> >
> >from ksymoops:
> >Adhoc a000000000265050 <[ntfs]ntfs_decompress+d0/320>
> >Adhoc a000000000262d80 <[ntfs]ntfs_decompress_run+2a0/3c0>
> >Adhoc a000000000262ba0 <[ntfs]ntfs_decompress_run+c0/3c0>
> >Adhoc a000000000262d60 <[ntfs]ntfs_decompress_run+280/3c0>
> >
> >Are these dangerous? I gather IA64 port has some kind of handler for these,
> >since they don't oops.
>
> They are at least one of the explanations why the driver would not work on
> non-intel arch...

It does work (I _was_ surprised) on IA64. I can read the one ntfs partition
quite well.

> I gather most other arch don't cope with unaligned accesses. I am
> surprised those are the only ones you see actually...

They are not the only ones, I haven't tracked all the entries in dmesg.

> This particular function is not implemented correctly anyway - it will not
> work on BE arch for example (despite all the endian conversion functions,
> some of which are wrong AFAIK).

I see.

> The changes to make the driver clean are too complex and I am not going to
> bother considering the replacement ntfs driver (ntfs tng available from
> linux-ntfs cvs on sourceforge) is close to being ready for inclusion into
> 2.5.x (as soon as read support is completed I will submit it, probably
> sometime in January). If anyone wants to work on the old driver I am happy
> to take patches. (-;

Ok. I can give the new driver a shot on IA64 sometime, if I find the time.



-- v --

[email protected]

2001-12-17 21:38:01

by H. Peter Anvin

[permalink] [raw]
Subject: Re: Ia64 unaligned accesses in ntfs driver

Followup to: <[email protected]>
By author: Anton Altaparmakov <[email protected]>
In newsgroup: linux.dev.kernel
>
> They are at least one of the explanations why the driver would not work on
> non-intel arch... I gather most other arch don't cope with unaligned
> accesses. I am surprised those are the only ones you see actually...
>

A lot of architectures that don't cope with it natively do, however,
support emulating it via a trap handler.

-hpa
--
<[email protected]> at work, <[email protected]> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt <[email protected]>