It caused by rpc_restart_call_prepare with an uninitialized task
for the pnfs do I/O locally without sending any RPC to MDS.
Some debug messages,
[ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
[ 1004.002110] bl_read_pagelist: pg_offset 2048
[ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
[ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
[ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
[ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
[ 1004.003989] --> nfs4_read_done
[ 1004.004224] nfs_readpage_done: 0
[ 1004.004459] nfs_pgio_result: 0
[ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
[ 1004.004926] nfs_readpage_retry: tk_ops (null)
Panic messages as,
[ 1004.005170] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1004.005452] IP: [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
[ 1004.005702] PGD 0
[ 1004.005937] Oops: 0000 [#1]
[ 1004.006175] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) xfs libcrc32c btrfs coretemp crct10dif_pclmul ppdev crc32_pclmul crc32c_intel ghash_clmulni_intel vmw_balloon vmw_vmci parport_pc parport nfsd(OE) shpchp xor raid6_pq i2c_piix4 auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) vmwgfx drm_kms_helper ttm drm serio_raw e1000 mptspi scsi_transport_spi mptscsih ata_generic mptbase pata_acpi [last unloaded: fscache]
[ 1004.007611] CPU: 0 PID: 3489 Comm: kworker/0:2 Tainted: G OE 4.3.0-rc1+ #252
[ 1004.007920] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
[ 1004.008571] Workqueue: events bl_read_cleanup [blocklayoutdriver]
[ 1004.008917] task: ffff88006ceab080 ti: ffff880017700000 task.ti: ffff880017700000
[ 1004.009315] RIP: 0010:[<ffffffffa0075f8a>] [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
[ 1004.010152] RSP: 0018:ffff880017703cc8 EFLAGS: 00010246
[ 1004.010589] RAX: 0000000000000000 RBX: ffff880017726000 RCX: 0000000000000006
[ 1004.011007] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800177260d8
[ 1004.011428] RBP: ffff880017703cc8 R08: 0000000000000001 R09: 0000000000000000
[ 1004.011831] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8800177260d8
[ 1004.012237] R13: ffff8800686008b0 R14: 0000000000000000 R15: ffff880017726160
[ 1004.012666] FS: 0000000000000000(0000) GS:ffffffff81c29000(0000) knlGS:0000000000000000
[ 1004.013478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1004.013930] CR2: 0000000000000000 CR3: 000000006ccbe000 CR4: 00000000001406f0
[ 1004.014592] Stack:
[ 1004.015103] ffff880017703cf0 ffffffffa04c436e ffff8800177260d8 ffff880017726000
[ 1004.015611] ffff8800686008b0 ffff880017703d18 ffffffffa04c2fb8 ffff880017726160
[ 1004.016105] ffff880017726000 ffff88007ff43100 ffff880017703d40 ffffffffa05349c4
[ 1004.016565] Call Trace:
[ 1004.017071] [<ffffffffa04c436e>] nfs_readpage_result+0x11e/0x130 [nfs]
[ 1004.017546] [<ffffffffa04c2fb8>] nfs_pgio_result+0x88/0xa0 [nfs]
[ 1004.018009] [<ffffffffa05349c4>] pnfs_ld_read_done+0x44/0xf0 [nfsv4]
[ 1004.018469] [<ffffffffa04a8532>] bl_read_cleanup+0x22/0x50 [blocklayoutdriver]
[ 1004.018938] [<ffffffff810a388c>] process_one_work+0x21c/0x4c0
[ 1004.019406] [<ffffffff810a37dd>] ? process_one_work+0x16d/0x4c0
[ 1004.019876] [<ffffffff810a3b7a>] worker_thread+0x4a/0x440
[ 1004.020339] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
[ 1004.020795] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
[ 1004.021289] [<ffffffff810a8d85>] kthread+0xf5/0x110
[ 1004.021735] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
[ 1004.022177] [<ffffffff8172cd1f>] ret_from_fork+0x3f/0x70
[ 1004.022604] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
[ 1004.023025] Code: 00 0f 1f 44 00 00 31 c0 f6 87 e9 00 00 00 01 55 48 89 e5 75 29 48 8b 47 58 48 c7 47 50 80 42 07 a0 c7 87 e4 00 00 00 00 00 00 00 <48> 83 38 00 74 0f 48 c7 47 50 b0 f1 07 a0 b8 01 00 00 00 5d c3
[ 1004.024344] RIP [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
[ 1004.024773] RSP <ffff880017703cc8>
[ 1004.025228] CR2: 0000000000000000
ping ...
What's your opinion about this problem ?
If read/write of block layout file with bad length (res.count != arg.count),
should nfs retry? NFS try to call rpc_restart_call_prepare() right now,
that cause a panic with uninitialized task.
thanks,
Kinglong Mee
On 9/21/2015 11:22, Kinglong Mee wrote:
> It caused by rpc_restart_call_prepare with an uninitialized task
> for the pnfs do I/O locally without sending any RPC to MDS.
>
> Some debug messages,
>
> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
> [ 1004.002110] bl_read_pagelist: pg_offset 2048
> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
> [ 1004.003989] --> nfs4_read_done
> [ 1004.004224] nfs_readpage_done: 0
> [ 1004.004459] nfs_pgio_result: 0
> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
>
> Panic messages as,
>
> [ 1004.005170] BUG: unable to handle kernel NULL pointer dereference at (null)
> [ 1004.005452] IP: [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
> [ 1004.005702] PGD 0
> [ 1004.005937] Oops: 0000 [#1]
> [ 1004.006175] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) xfs libcrc32c btrfs coretemp crct10dif_pclmul ppdev crc32_pclmul crc32c_intel ghash_clmulni_intel vmw_balloon vmw_vmci parport_pc parport nfsd(OE) shpchp xor raid6_pq i2c_piix4 auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) vmwgfx drm_kms_helper ttm drm serio_raw e1000 mptspi scsi_transport_spi mptscsih ata_generic mptbase pata_acpi [last unloaded: fscache]
> [ 1004.007611] CPU: 0 PID: 3489 Comm: kworker/0:2 Tainted: G OE 4.3.0-rc1+ #252
> [ 1004.007920] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
> [ 1004.008571] Workqueue: events bl_read_cleanup [blocklayoutdriver]
> [ 1004.008917] task: ffff88006ceab080 ti: ffff880017700000 task.ti: ffff880017700000
> [ 1004.009315] RIP: 0010:[<ffffffffa0075f8a>] [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
> [ 1004.010152] RSP: 0018:ffff880017703cc8 EFLAGS: 00010246
> [ 1004.010589] RAX: 0000000000000000 RBX: ffff880017726000 RCX: 0000000000000006
> [ 1004.011007] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800177260d8
> [ 1004.011428] RBP: ffff880017703cc8 R08: 0000000000000001 R09: 0000000000000000
> [ 1004.011831] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8800177260d8
> [ 1004.012237] R13: ffff8800686008b0 R14: 0000000000000000 R15: ffff880017726160
> [ 1004.012666] FS: 0000000000000000(0000) GS:ffffffff81c29000(0000) knlGS:0000000000000000
> [ 1004.013478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1004.013930] CR2: 0000000000000000 CR3: 000000006ccbe000 CR4: 00000000001406f0
> [ 1004.014592] Stack:
> [ 1004.015103] ffff880017703cf0 ffffffffa04c436e ffff8800177260d8 ffff880017726000
> [ 1004.015611] ffff8800686008b0 ffff880017703d18 ffffffffa04c2fb8 ffff880017726160
> [ 1004.016105] ffff880017726000 ffff88007ff43100 ffff880017703d40 ffffffffa05349c4
> [ 1004.016565] Call Trace:
> [ 1004.017071] [<ffffffffa04c436e>] nfs_readpage_result+0x11e/0x130 [nfs]
> [ 1004.017546] [<ffffffffa04c2fb8>] nfs_pgio_result+0x88/0xa0 [nfs]
> [ 1004.018009] [<ffffffffa05349c4>] pnfs_ld_read_done+0x44/0xf0 [nfsv4]
> [ 1004.018469] [<ffffffffa04a8532>] bl_read_cleanup+0x22/0x50 [blocklayoutdriver]
> [ 1004.018938] [<ffffffff810a388c>] process_one_work+0x21c/0x4c0
> [ 1004.019406] [<ffffffff810a37dd>] ? process_one_work+0x16d/0x4c0
> [ 1004.019876] [<ffffffff810a3b7a>] worker_thread+0x4a/0x440
> [ 1004.020339] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
> [ 1004.020795] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
> [ 1004.021289] [<ffffffff810a8d85>] kthread+0xf5/0x110
> [ 1004.021735] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
> [ 1004.022177] [<ffffffff8172cd1f>] ret_from_fork+0x3f/0x70
> [ 1004.022604] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
> [ 1004.023025] Code: 00 0f 1f 44 00 00 31 c0 f6 87 e9 00 00 00 01 55 48 89 e5 75 29 48 8b 47 58 48 c7 47 50 80 42 07 a0 c7 87 e4 00 00 00 00 00 00 00 <48> 83 38 00 74 0f 48 c7 47 50 b0 f1 07 a0 b8 01 00 00 00 5d c3
> [ 1004.024344] RIP [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
> [ 1004.024773] RSP <ffff880017703cc8>
> [ 1004.025228] CR2: 0000000000000000
>
On Tue, Oct 13, 2015 at 8:45 AM, Kinglong Mee <[email protected]> wrote:
> ping ...
>
> What's your opinion about this problem ?
>
> If read/write of block layout file with bad length (res.count != arg.count),
> should nfs retry? NFS try to call rpc_restart_call_prepare() right now,
> that cause a panic with uninitialized task.
The client should not be attempting to read more data than what was
requested by the O_DIRECT read request. It should be strictly
respecting the boundaries of the user buffer that was supplied. Any
idea why this is happening?
>
> thanks,
> Kinglong Mee
>
> On 9/21/2015 11:22, Kinglong Mee wrote:
>> It caused by rpc_restart_call_prepare with an uninitialized task
>> for the pnfs do I/O locally without sending any RPC to MDS.
>>
>> Some debug messages,
>>
>> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
>> [ 1004.002110] bl_read_pagelist: pg_offset 2048
>> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
>> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
>> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
>> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
>> [ 1004.003989] --> nfs4_read_done
>> [ 1004.004224] nfs_readpage_done: 0
>> [ 1004.004459] nfs_pgio_result: 0
>> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
>> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
>>
>> Panic messages as,
>>
>> [ 1004.005170] BUG: unable to handle kernel NULL pointer dereference at (null)
>> [ 1004.005452] IP: [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>> [ 1004.005702] PGD 0
>> [ 1004.005937] Oops: 0000 [#1]
>> [ 1004.006175] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) xfs libcrc32c btrfs coretemp crct10dif_pclmul ppdev crc32_pclmul crc32c_intel ghash_clmulni_intel vmw_balloon vmw_vmci parport_pc parport nfsd(OE) shpchp xor raid6_pq i2c_piix4 auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) vmwgfx drm_kms_helper ttm drm serio_raw e1000 mptspi scsi_transport_spi mptscsih ata_generic mptbase pata_acpi [last unloaded: fscache]
>> [ 1004.007611] CPU: 0 PID: 3489 Comm: kworker/0:2 Tainted: G OE 4.3.0-rc1+ #252
>> [ 1004.007920] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
>> [ 1004.008571] Workqueue: events bl_read_cleanup [blocklayoutdriver]
>> [ 1004.008917] task: ffff88006ceab080 ti: ffff880017700000 task.ti: ffff880017700000
>> [ 1004.009315] RIP: 0010:[<ffffffffa0075f8a>] [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>> [ 1004.010152] RSP: 0018:ffff880017703cc8 EFLAGS: 00010246
>> [ 1004.010589] RAX: 0000000000000000 RBX: ffff880017726000 RCX: 0000000000000006
>> [ 1004.011007] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800177260d8
>> [ 1004.011428] RBP: ffff880017703cc8 R08: 0000000000000001 R09: 0000000000000000
>> [ 1004.011831] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8800177260d8
>> [ 1004.012237] R13: ffff8800686008b0 R14: 0000000000000000 R15: ffff880017726160
>> [ 1004.012666] FS: 0000000000000000(0000) GS:ffffffff81c29000(0000) knlGS:0000000000000000
>> [ 1004.013478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 1004.013930] CR2: 0000000000000000 CR3: 000000006ccbe000 CR4: 00000000001406f0
>> [ 1004.014592] Stack:
>> [ 1004.015103] ffff880017703cf0 ffffffffa04c436e ffff8800177260d8 ffff880017726000
>> [ 1004.015611] ffff8800686008b0 ffff880017703d18 ffffffffa04c2fb8 ffff880017726160
>> [ 1004.016105] ffff880017726000 ffff88007ff43100 ffff880017703d40 ffffffffa05349c4
>> [ 1004.016565] Call Trace:
>> [ 1004.017071] [<ffffffffa04c436e>] nfs_readpage_result+0x11e/0x130 [nfs]
>> [ 1004.017546] [<ffffffffa04c2fb8>] nfs_pgio_result+0x88/0xa0 [nfs]
>> [ 1004.018009] [<ffffffffa05349c4>] pnfs_ld_read_done+0x44/0xf0 [nfsv4]
>> [ 1004.018469] [<ffffffffa04a8532>] bl_read_cleanup+0x22/0x50 [blocklayoutdriver]
>> [ 1004.018938] [<ffffffff810a388c>] process_one_work+0x21c/0x4c0
>> [ 1004.019406] [<ffffffff810a37dd>] ? process_one_work+0x16d/0x4c0
>> [ 1004.019876] [<ffffffff810a3b7a>] worker_thread+0x4a/0x440
>> [ 1004.020339] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
>> [ 1004.020795] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
>> [ 1004.021289] [<ffffffff810a8d85>] kthread+0xf5/0x110
>> [ 1004.021735] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
>> [ 1004.022177] [<ffffffff8172cd1f>] ret_from_fork+0x3f/0x70
>> [ 1004.022604] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
>> [ 1004.023025] Code: 00 0f 1f 44 00 00 31 c0 f6 87 e9 00 00 00 01 55 48 89 e5 75 29 48 8b 47 58 48 c7 47 50 80 42 07 a0 c7 87 e4 00 00 00 00 00 00 00 <48> 83 38 00 74 0f 48 c7 47 50 b0 f1 07 a0 b8 01 00 00 00 5d c3
>> [ 1004.024344] RIP [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>> [ 1004.024773] RSP <ffff880017703cc8>
>> [ 1004.025228] CR2: 0000000000000000
>>
On 10/13/2015 21:45, Trond Myklebust wrote:
> On Tue, Oct 13, 2015 at 8:45 AM, Kinglong Mee <[email protected]> wrote:
>> ping ...
>>
>> What's your opinion about this problem ?
>>
>> If read/write of block layout file with bad length (res.count != arg.count),
>> should nfs retry? NFS try to call rpc_restart_call_prepare() right now,
>> that cause a panic with uninitialized task.
>
> The client should not be attempting to read more data than what was
> requested by the O_DIRECT read request. It should be strictly
> respecting the boundaries of the user buffer that was supplied.
Yes, that's right.
> Any idea why this is happening?
As post before, bl_read_pagelist() return a longer result that causes the panic.
>>> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
>>> [ 1004.002110] bl_read_pagelist: pg_offset 2048
>>> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
>>> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
>>> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
>>> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
>>> [ 1004.003989] --> nfs4_read_done
>>> [ 1004.004224] nfs_readpage_done: 0
>>> [ 1004.004459] nfs_pgio_result: 0
>>> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
>>> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
I test with the following program with pnfs of block layout,
-------------------------------------------------------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
int main(int argc, char **argv)
{
char buf[2048];
char *filename = NULL;
int fd = -1;
if (argc < 2) {
printf("Usage: %s filename\n", argv[0]);
return 0;
}
filename = argv[1];
fd = open(filename, O_RDONLY | O_DIRECT);
if (fd < 0) {
printf("Open %s fail: %m\n", filename);
return 1;
}
if (lseek(fd, 2048, SEEK_SET) != 2048) {
printf("Seek %s's 2048 fail: %m\n", filename);
goto out;
}
if (read(fd, buf, sizeof(buf)) != sizeof(buf))
printf("Read 2048 bityes data from %s fail: %m\n", filename);
out:
close(fd);
return 0;
}
thanks,
Kinglong Mee
>> On 9/21/2015 11:22, Kinglong Mee wrote:
>>> It caused by rpc_restart_call_prepare with an uninitialized task
>>> for the pnfs do I/O locally without sending any RPC to MDS.
>>>
>>> Some debug messages,
>>>
>>> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
>>> [ 1004.002110] bl_read_pagelist: pg_offset 2048
>>> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
>>> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
>>> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
>>> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
>>> [ 1004.003989] --> nfs4_read_done
>>> [ 1004.004224] nfs_readpage_done: 0
>>> [ 1004.004459] nfs_pgio_result: 0
>>> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
>>> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
>>>
>>> Panic messages as,
>>>
>>> [ 1004.005170] BUG: unable to handle kernel NULL pointer dereference at (null)
>>> [ 1004.005452] IP: [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>>> [ 1004.005702] PGD 0
>>> [ 1004.005937] Oops: 0000 [#1]
>>> [ 1004.006175] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) xfs libcrc32c btrfs coretemp crct10dif_pclmul ppdev crc32_pclmul crc32c_intel ghash_clmulni_intel vmw_balloon vmw_vmci parport_pc parport nfsd(OE) shpchp xor raid6_pq i2c_piix4 auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) vmwgfx drm_kms_helper ttm drm serio_raw e1000 mptspi scsi_transport_spi mptscsih ata_generic mptbase pata_acpi [last unloaded: fscache]
>>> [ 1004.007611] CPU: 0 PID: 3489 Comm: kworker/0:2 Tainted: G OE 4.3.0-rc1+ #252
>>> [ 1004.007920] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014
>>> [ 1004.008571] Workqueue: events bl_read_cleanup [blocklayoutdriver]
>>> [ 1004.008917] task: ffff88006ceab080 ti: ffff880017700000 task.ti: ffff880017700000
>>> [ 1004.009315] RIP: 0010:[<ffffffffa0075f8a>] [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>>> [ 1004.010152] RSP: 0018:ffff880017703cc8 EFLAGS: 00010246
>>> [ 1004.010589] RAX: 0000000000000000 RBX: ffff880017726000 RCX: 0000000000000006
>>> [ 1004.011007] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800177260d8
>>> [ 1004.011428] RBP: ffff880017703cc8 R08: 0000000000000001 R09: 0000000000000000
>>> [ 1004.011831] R10: 0000000000000000 R11: 0000000000000001 R12: ffff8800177260d8
>>> [ 1004.012237] R13: ffff8800686008b0 R14: 0000000000000000 R15: ffff880017726160
>>> [ 1004.012666] FS: 0000000000000000(0000) GS:ffffffff81c29000(0000) knlGS:0000000000000000
>>> [ 1004.013478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [ 1004.013930] CR2: 0000000000000000 CR3: 000000006ccbe000 CR4: 00000000001406f0
>>> [ 1004.014592] Stack:
>>> [ 1004.015103] ffff880017703cf0 ffffffffa04c436e ffff8800177260d8 ffff880017726000
>>> [ 1004.015611] ffff8800686008b0 ffff880017703d18 ffffffffa04c2fb8 ffff880017726160
>>> [ 1004.016105] ffff880017726000 ffff88007ff43100 ffff880017703d40 ffffffffa05349c4
>>> [ 1004.016565] Call Trace:
>>> [ 1004.017071] [<ffffffffa04c436e>] nfs_readpage_result+0x11e/0x130 [nfs]
>>> [ 1004.017546] [<ffffffffa04c2fb8>] nfs_pgio_result+0x88/0xa0 [nfs]
>>> [ 1004.018009] [<ffffffffa05349c4>] pnfs_ld_read_done+0x44/0xf0 [nfsv4]
>>> [ 1004.018469] [<ffffffffa04a8532>] bl_read_cleanup+0x22/0x50 [blocklayoutdriver]
>>> [ 1004.018938] [<ffffffff810a388c>] process_one_work+0x21c/0x4c0
>>> [ 1004.019406] [<ffffffff810a37dd>] ? process_one_work+0x16d/0x4c0
>>> [ 1004.019876] [<ffffffff810a3b7a>] worker_thread+0x4a/0x440
>>> [ 1004.020339] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
>>> [ 1004.020795] [<ffffffff810a3b30>] ? process_one_work+0x4c0/0x4c0
>>> [ 1004.021289] [<ffffffff810a8d85>] kthread+0xf5/0x110
>>> [ 1004.021735] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
>>> [ 1004.022177] [<ffffffff8172cd1f>] ret_from_fork+0x3f/0x70
>>> [ 1004.022604] [<ffffffff810a8c90>] ? kthread_create_on_node+0x240/0x240
>>> [ 1004.023025] Code: 00 0f 1f 44 00 00 31 c0 f6 87 e9 00 00 00 01 55 48 89 e5 75 29 48 8b 47 58 48 c7 47 50 80 42 07 a0 c7 87 e4 00 00 00 00 00 00 00 <48> 83 38 00 74 0f 48 c7 47 50 b0 f1 07 a0 b8 01 00 00 00 5d c3
>>> [ 1004.024344] RIP [<ffffffffa0075f8a>] rpc_restart_call_prepare+0x2a/0x50 [sunrpc]
>>> [ 1004.024773] RSP <ffff880017703cc8>
>>> [ 1004.025228] CR2: 0000000000000000
>>>
>
On Wed, Oct 14, 2015 at 3:18 AM, Kinglong Mee <[email protected]> wrote:
> On 10/13/2015 21:45, Trond Myklebust wrote:
>> On Tue, Oct 13, 2015 at 8:45 AM, Kinglong Mee <[email protected]> wrote:
>>> ping ...
>>>
>>> What's your opinion about this problem ?
>>>
>>> If read/write of block layout file with bad length (res.count != arg.count),
>>> should nfs retry? NFS try to call rpc_restart_call_prepare() right now,
>>> that cause a panic with uninitialized task.
>>
>> The client should not be attempting to read more data than what was
>> requested by the O_DIRECT read request. It should be strictly
>> respecting the boundaries of the user buffer that was supplied.
>
> Yes, that's right.
>
>> Any idea why this is happening?
>
> As post before, bl_read_pagelist() return a longer result that causes the panic.
>
>>>> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
>>>> [ 1004.002110] bl_read_pagelist: pg_offset 2048
>>>> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
>>>> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
>>>> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
>>>> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
>>>> [ 1004.003989] --> nfs4_read_done
>>>> [ 1004.004224] nfs_readpage_done: 0
>>>> [ 1004.004459] nfs_pgio_result: 0
>>>> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
>>>> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
Right, but that means one of two things: Either we need to fix
bl_read_pagelist, or we need to fall back to read-through-MDS in this
case.
On 10/14/2015 18:53, Trond Myklebust wrote:
> On Wed, Oct 14, 2015 at 3:18 AM, Kinglong Mee <[email protected]> wrote:
>> On 10/13/2015 21:45, Trond Myklebust wrote:
>>> On Tue, Oct 13, 2015 at 8:45 AM, Kinglong Mee <[email protected]> wrote:
>>>> ping ...
>>>>
>>>> What's your opinion about this problem ?
>>>>
>>>> If read/write of block layout file with bad length (res.count != arg.count),
>>>> should nfs retry? NFS try to call rpc_restart_call_prepare() right now,
>>>> that cause a panic with uninitialized task.
>>>
>>> The client should not be attempting to read more data than what was
>>> requested by the O_DIRECT read request. It should be strictly
>>> respecting the boundaries of the user buffer that was supplied.
>>
>> Yes, that's right.
>>
>>> Any idea why this is happening?
>>
>> As post before, bl_read_pagelist() return a longer result that causes the panic.
>>
>>>>> [ 1004.001842] bl_read_pagelist enter nr_pages 1 offset 2048 count 2048
>>>>> [ 1004.002110] bl_read_pagelist: pg_offset 2048
>>>>> [ 1004.002370] bl_read_pagelist: pg_len 2048 is_dio
>>>>> [ 1004.002617] bl_read_pagelist: pg_len 2048 after do_add_page_to_bio
>>>>> [ 1004.002853] bl_read_pagelist: 2048 4096 "(isect << SECTOR_SHIFT) < header->inode->i_size"
>>>>> [ 1004.003774] NFS: nfs_pgio_result: 0, (status 0), tk_ops (null)
>>>>> [ 1004.003989] --> nfs4_read_done
>>>>> [ 1004.004224] nfs_readpage_done: 0
>>>>> [ 1004.004459] nfs_pgio_result: 0
>>>>> [ 1004.004691] nfs_readpage_result: eof 0, res.count 4096, args.count 2048
>>>>> [ 1004.004926] nfs_readpage_retry: tk_ops (null)
>
> Right, but that means one of two things: Either we need to fix
> bl_read_pagelist, or we need to fall back to read-through-MDS in this
> case.
I don't know the restrict of calling bl_read_pagelist,
Should the offset or count be aligning to PAGE_SIZE or not?
If not, there maybe some problem exist in bl_read_pagelist.
Otherwise, if bl_read_pagelist success but with res.count
that not equal to args.count, nfs should fall back to read-through-MDS.
So, both need be fixed.
thanks,
Kinglong Mee
Blocklayout uses file offset for the read-back page's offset of first writing,
it's definitely wrong, it writes data to bad address of page that cause userspace
application segment fault. It must be the page base stored in header->args.pgbase.
Also, the pg_offset has no influence with isect and extent length.
Note: The offset of the non-first page is always zero.
Ps: A test program will segment fault at read() as,
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
int main(int argc, char **argv)
{
char buf[2049];
char *filename = NULL;
int fd = -1;
if (argc < 2) {
printf("Usage: %s filename\n", argv[0]);
return 0;
}
filename = argv[1];
fd = open(filename, O_RDONLY | O_DIRECT);
if (fd < 0) {
printf("Open %s fail: %m\n", filename);
return 1;
}
lseek(fd, 2048, SEEK_SET);
if (read(fd, buf, sizeof(buf) - 1) != (sizeof(buf) - 1))
printf("Read 4096 bityes data from %s fail: %m\n", filename);
out:
close(fd);
return 0;
}
Signed-off-by: Kinglong Mee <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9cd4eb3..ddd0138 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -229,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
- unsigned int pg_offset, pg_len;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
@@ -262,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
@@ -273,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE;
}
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
-
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
@@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
+ pg_offset = 0;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
--
2.5.0
If non rpc-based layout driver return bad length of data, nfs retries
by calling rpc_restart_call_prepare() that cause an NULL reference panic.
This patch lets nfs retry through MDS for non rpc-based layout driver
return bad length of data.
[13034.883329] BUG: unable to handle kernel NULL pointer dereference at (null)
[13034.884902] IP: [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.886558] PGD 0
[13034.888126] Oops: 0000 [#1] KASAN
[13034.889710] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) nfsd(OE) xfs libcrc32c coretemp btrfs crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev vmw_balloon auth_rpcgss shpchp nfs_acl lockd vmw_vmci parport_pc xor raid6_pq grace parport sunrpc i2c_piix4 vmwgfx drm_kms_helper ttm drm mptspi e1000 serio_raw scsi_transport_spi mptscsih mptbase ata_generic pata_acpi [last unloaded: fscache]
[13034.898260] CPU: 0 PID: 10112 Comm: kworker/0:1 Tainted: G OE 4.3.0-rc5+ #279
[13034.899932] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[13034.903342] Workqueue: events bl_read_cleanup [blocklayoutdriver]
[13034.905059] task: ffff88006a9148c0 ti: ffff880035e90000 task.ti: ffff880035e90000
[13034.906827] RIP: 0010:[<ffffffffa00db372>] [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.910522] RSP: 0018:ffff880035e97b58 EFLAGS: 00010282
[13034.912378] RAX: fffffbfff04a5a94 RBX: ffff880068fe4858 RCX: 0000000000000003
[13034.914339] RDX: dffffc0000000000 RSI: 0000000000000003 RDI: 0000000000000282
[13034.916236] RBP: ffff880035e97b68 R08: 0000000000000001 R09: 0000000000000001
[13034.918229] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[13034.920007] R13: ffff880068fe4858 R14: ffff880068fe4a60 R15: 0000000000001000
[13034.921845] FS: 0000000000000000(0000) GS:ffffffff82247000(0000) knlGS:0000000000000000
[13034.923645] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[13034.925525] CR2: 0000000000000000 CR3: 00000000063dd000 CR4: 00000000001406f0
[13034.932808] Stack:
[13034.934813] ffff880068fe4780 0000000000001000 ffff880035e97ba8 ffffffffa08800d2
[13034.936675] ffffffffa088029d ffff880068fe4780 ffff880068fe4858 ffffffffa089c0a0
[13034.938593] ffff880068fe47e0 ffff88005d59faf0 ffff880035e97be0 ffffffffa087e08f
[13034.940454] Call Trace:
[13034.942388] [<ffffffffa08800d2>] nfs_readpage_result+0x112/0x200 [nfs]
[13034.944317] [<ffffffffa088029d>] ? nfs_readpage_done+0xdd/0x160 [nfs]
[13034.946267] [<ffffffffa087e08f>] nfs_pgio_result+0x9f/0x120 [nfs]
[13034.948166] [<ffffffffa09266cc>] pnfs_ld_read_done+0x7c/0x1e0 [nfsv4]
[13034.950247] [<ffffffffa03b07ee>] bl_read_cleanup+0x2e/0x60 [blocklayoutdriver]
[13034.952156] [<ffffffff810ebf62>] process_one_work+0x412/0x870
[13034.954102] [<ffffffff810ebe84>] ? process_one_work+0x334/0x870
[13034.955949] [<ffffffff810ebb50>] ? queue_delayed_work_on+0x40/0x40
[13034.957985] [<ffffffff810ec441>] worker_thread+0x81/0x6a0
[13034.959817] [<ffffffff810ec3c0>] ? process_one_work+0x870/0x870
[13034.961785] [<ffffffff810f43bd>] kthread+0x17d/0x1a0
[13034.963544] [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.965479] [<ffffffff81100428>] ? finish_task_switch+0x88/0x220
[13034.967223] [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.968929] [<ffffffff81b6ae5f>] ret_from_fork+0x3f/0x70
[13034.970534] [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.972176] Code: c7 43 50 40 84 0d a0 e8 3d fe 1c e1 48 8d 7b 58 c7 83 e4 00 00 00 00 00 00 00 e8 ca fe 1c e1 4c 8b 63 58 4c 89 e7 e8 be fe 1c e1 <49> 83 3c 24 00 74 12 48 c7 43 50 f0 a2 0e a0 b8 01 00 00 00 5b
[13034.977148] RIP [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.978780] RSP <ffff880035e97b58>
[13034.980399] CR2: 0000000000000000
Signed-off-by: Kinglong Mee <[email protected]>
---
fs/nfs/pnfs.c | 12 +++++++-----
fs/nfs/read.c | 9 ++++++++-
fs/nfs/write.c | 7 +++++++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8abe271..93496c0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1912,12 +1912,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
- if (!hdr->pnfs_error) {
+ if (likely(!hdr->pnfs_error)) {
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_write_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -2028,11 +2029,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(hdr);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 01b8cc8..0a5e33f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -246,6 +246,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Yes, so retry the read at the end of the hdr */
hdr->mds_offset += resp->count;
argp->offset += resp->count;
@@ -268,7 +275,7 @@ static void nfs_readpage_result(struct rpc_task *task,
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (hdr->res.count != hdr->args.count)
+ } else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75ab762..7b93164 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1505,6 +1505,13 @@ static void nfs_writeback_result(struct rpc_task *task,
task->tk_status = -EIO;
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
--
2.5.0