2012-05-09 16:51:47

by Myklebust, Trond

[permalink] [raw]
Subject: [PATCH] NFS: Prevent a deadlock in the new read and write code

In order to avoid an ABBA lock ordering issue, the write code must
unlock and release the nfs_page _before_ it calls end_page_writeback.
Since the last release of the nfs_page will trigger a call to put
the open_context, we need to ensure that something else holds a
reference to the open context until we're done with the
end_page_writeback so that we don't end up calling iput while we're
still holding a PG_writeback lock.
We can fix this by changing nfs_writedata_release to put the open
context after we have processed all the page data.

The read code does not suffer from the same problem, since it doesn't
hold a lock on the nfs_page, so it just calls nfs_release_request after
unlocking the page itself. However, for symmetry reasons, we make the
same change to nfs_readdata_release.

Signed-off-by: Trond Myklebust <[email protected]>
Cc: Fred Isaman <[email protected]>
---
fs/nfs/read.c | 3 ++-
fs/nfs/write.c | 13 ++++++++++++-
2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f23cf25..dec47e5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -87,8 +87,8 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
{
struct nfs_pgio_header *hdr = rdata->header;
struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
+ struct nfs_open_context *ctx = rdata->args.context;

- put_nfs_open_context(rdata->args.context);
if (rdata->pages.pagevec != rdata->pages.page_array)
kfree(rdata->pages.pagevec);
if (rdata != &read_header->rpc_data)
@@ -97,6 +97,7 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
rdata->header = NULL;
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
+ put_nfs_open_context(ctx);
}

static
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6f263da..69825f7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -119,8 +119,8 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
{
struct nfs_pgio_header *hdr = wdata->header;
struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
+ struct nfs_open_context *ctx = wdata->args.context;

- put_nfs_open_context(wdata->args.context);
if (wdata->pages.pagevec != wdata->pages.page_array)
kfree(wdata->pages.pagevec);
if (wdata != &write_header->rpc_data)
@@ -129,6 +129,10 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
wdata->header = NULL;
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
+ /* Note: put the open context after processing all the
+ * page data in order to avoid a deadlock in nfs_write_completion.
+ */
+ put_nfs_open_context(ctx);
}

static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -597,6 +601,13 @@ int nfs_write_need_commit(struct nfs_write_data *data)

#endif

+/*
+ * nfs_write_completion - process the results of the write RPC calls
+ *
+ * Note: this function assumes that the caller is holding a reference
+ * to the nfs_open_context in order to avoid a deadlock when calling
+ * nfs_end_page_writeback after nfs_unlock_request.
+ */
static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
--
1.7.7.6



2012-05-09 19:15:57

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH] NFS: Prevent a deadlock in the new read and write code

T24gV2VkLCAyMDEyLTA1LTA5IGF0IDE4OjIyICswMDAwLCBBZGFtc29uLCBEcm9zIHdyb3RlOg0K
PiBPbiBNYXkgOSwgMjAxMiwgYXQgMjoyMCBQTSwgQWRhbXNvbiwgRHJvcyB3cm90ZToNCj4gDQo+
ID4gQUNLIC0gVGhpcyBmaXhlcyB0aGUgZXJyYW50IGJlaGF2aW9yIEkgd2FzIHNlZWluZy4NCj4g
PiANCj4gPiBCZWZvcmUgKG9uIGFueSBwcm90b2NvbCB2ZXJzaW9uIG1vdW50ZWQgYXQgL21udCk6
DQo+ID4gJCBkZCBpZj0vZGV2L3plcm8gb2Y9emVyb2ZpbGUgYnM9MTAwMjQgY291bnQ9MTAwMCA7
IHdoaWxlIHRydWU7IGRvIChybSAtcmYgL21udC9mb28gJiYgZWNobyBybSBvaykgfHwgbHMgLWxh
IC9tbnQvZm9vIDsgbWtkaXIgLXAgL21udC9mb28vOyBjcCB6ZXJvZmlsZSAvbW50L2Zvby9maWxl
IDsgZG9uZQ0KPiA+IDEwMDArMCByZWNvcmRzIGluDQo+ID4gMTAwMCswIHJlY29yZHMgb3V0DQo+
ID4gMTAwMjQwMDAgYnl0ZXMgKDEwIE1CKSBjb3BpZWQsIDAuMTQ2NzM4IHMsIDY4LjMgTUIvcw0K
PiA+IHJtIG9rDQo+ID4gcm06IGNhbm5vdCByZW1vdmUgYC9tbnQvZm9vJzogRGlyZWN0b3J5IG5v
dCBlbXB0eQ0KPiA+IGxzOiAvbW50L2Zvby8ubmZzMDAwMDAwMDAwMDA0MDg5ZDAwMDAwMDAxOiBO
byBzdWNoIGZpbGUgb3IgZGlyZWN0b3J5DQo+ID4gdG90YWwgOTgwMA0KPiA+IGRyd3hyd3hyLXgg
MiBkcm9zIGRyb3MgICAgIDQwOTYgTWF5ICA5IDA5OjQ2IC4NCj4gPiBkcnd4cnd4cnd4IDUgcm9v
dCByb290ICAgICA0MDk2IE1heSAgOSAwOTo0NiAuLg0KPiA+IC1ydy1ydy1yLS0gMCBkcm9zIGRy
b3MgMTAwMjQwMDAgTWF5ICA5IDA5OjQ2IC5uZnMwMDAwMDAwMDAwMDQwODlkMDAwMDAwMDENCj4g
PiBybTogY2Fubm90IHJlbW92ZSBgL21udC9mb28nOiBEaXJlY3Rvcnkgbm90IGVtcHR5DQo+ID4g
Li4uDQo+ID4gDQo+IA0KPiBPb3BzLCB0aGlzIGlzIGFmdGVyIHRoZSBwYXRjaCBpcyBhcHBsaWVk
Og0KPiANCj4gPiAkIGRkIGlmPS9kZXYvemVybyBvZj16ZXJvZmlsZSBicz0xMDAyNCBjb3VudD0x
MDAwIDsgd2hpbGUgdHJ1ZTsgZG8gKHJtIC1yZiAvbW50L2ZvbyAmJiBlY2hvIHJtIG9rKSB8fCBs
cyAtbGEgL21udC9mb28gOyBta2RpciAtcCAvbW50L2Zvby87IGNwIHplcm9maWxlIC9tbnQvZm9v
L2ZpbGUgOyBkb25lDQo+ID4gMTAwMCswIHJlY29yZHMgaW4NCj4gPiAxMDAwKzAgcmVjb3JkcyBv
dXQNCj4gPiAxMDAyNDAwMCBieXRlcyAoMTAgTUIpIGNvcGllZCwgMC4xMzc1OTYgcywgNzIuOSBN
Qi9zDQo+ID4gcm0gb2sNCj4gPiBybSBvaw0KPiA+IHJtIG9rDQo+ID4gcm0gb2sNCj4gPiAuLi4N
Cj4gPiANCj4gPiAtZHJvcw0KDQpPSy4uLiBIYXZpbmcgdGhvdWdodCBhIGJpdCBtb3JlIGFib3V0
IHRoZSBwcm9ibGVtLCBJJ2QgcHJlZmVyIHRvIHNlZSB0aGUNCndyaXRlIGNvZGUgZG8gdGhlIHNh
bWUgcmVhbGx5IGFzIHRoZSByZWFkIGNvZGUgaXMgZG9pbmcuIGkuZS4gcmVsZWFzZQ0KdGhlIG5m
c19wYWdlIGFmdGVyIHRoZSBwYWdlX3dyaXRlYmFjayBsb2NrIGhhcyBiZWVuIHJlbGVhc2VkLg0K
DQpJJ3ZlIGp1c3Qgd3JpdHRlbiBhIHNlcmllcyBvZiBwYXRjaGVzIHRvIGRvIHRoaXMuIFRoZSBm
aXJzdCBwYXRjaCBpbiB0aGUNCnNlcmllcyBpcyB0aGUgYWN0dWFsIGZpeC4gVGhlIHJlc3QgaXMg
Y2xlYW4gdXBzIGluIG9yZGVyIHRvIG1ha2UgdGhlDQpjb2RlIGEgYml0IG1vcmUgcmVhZGFibGUu
Li4NCg0KLS0gDQpUcm9uZCBNeWtsZWJ1c3QNCkxpbnV4IE5GUyBjbGllbnQgbWFpbnRhaW5lcg0K
DQpOZXRBcHANClRyb25kLk15a2xlYnVzdEBuZXRhcHAuY29tDQp3d3cubmV0YXBwLmNvbQ0KDQo=

2012-05-09 18:22:24

by Adamson, Dros

[permalink] [raw]
Subject: Re: [PATCH] NFS: Prevent a deadlock in the new read and write code


On May 9, 2012, at 2:20 PM, Adamson, Dros wrote:

> ACK - This fixes the errant behavior I was seeing.
>
> Before (on any protocol version mounted at /mnt):
> $ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
> 1000+0 records in
> 1000+0 records out
> 10024000 bytes (10 MB) copied, 0.146738 s, 68.3 MB/s
> rm ok
> rm: cannot remove `/mnt/foo': Directory not empty
> ls: /mnt/foo/.nfs000000000004089d00000001: No such file or directory
> total 9800
> drwxrwxr-x 2 dros dros 4096 May 9 09:46 .
> drwxrwxrwx 5 root root 4096 May 9 09:46 ..
> -rw-rw-r-- 0 dros dros 10024000 May 9 09:46 .nfs000000000004089d00000001
> rm: cannot remove `/mnt/foo': Directory not empty
> ...
>

Oops, this is after the patch is applied:

> $ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
> 1000+0 records in
> 1000+0 records out
> 10024000 bytes (10 MB) copied, 0.137596 s, 72.9 MB/s
> rm ok
> rm ok
> rm ok
> rm ok
> ...
>
> -dros
>
> On May 9, 2012, at 12:51 PM, Trond Myklebust wrote:
>
>> In order to avoid an ABBA lock ordering issue, the write code must
>> unlock and release the nfs_page _before_ it calls end_page_writeback.
>> Since the last release of the nfs_page will trigger a call to put
>> the open_context, we need to ensure that something else holds a
>> reference to the open context until we're done with the
>> end_page_writeback so that we don't end up calling iput while we're
>> still holding a PG_writeback lock.
>> We can fix this by changing nfs_writedata_release to put the open
>> context after we have processed all the page data.
>>
>> The read code does not suffer from the same problem, since it doesn't
>> hold a lock on the nfs_page, so it just calls nfs_release_request after
>> unlocking the page itself. However, for symmetry reasons, we make the
>> same change to nfs_readdata_release.
>>
>> Signed-off-by: Trond Myklebust <[email protected]>
>> Cc: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/read.c | 3 ++-
>> fs/nfs/write.c | 13 ++++++++++++-
>> 2 files changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index f23cf25..dec47e5 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -87,8 +87,8 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
>> {
>> struct nfs_pgio_header *hdr = rdata->header;
>> struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
>> + struct nfs_open_context *ctx = rdata->args.context;
>>
>> - put_nfs_open_context(rdata->args.context);
>> if (rdata->pages.pagevec != rdata->pages.page_array)
>> kfree(rdata->pages.pagevec);
>> if (rdata != &read_header->rpc_data)
>> @@ -97,6 +97,7 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
>> rdata->header = NULL;
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> + put_nfs_open_context(ctx);
>> }
>>
>> static
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 6f263da..69825f7 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -119,8 +119,8 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
>> {
>> struct nfs_pgio_header *hdr = wdata->header;
>> struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
>> + struct nfs_open_context *ctx = wdata->args.context;
>>
>> - put_nfs_open_context(wdata->args.context);
>> if (wdata->pages.pagevec != wdata->pages.page_array)
>> kfree(wdata->pages.pagevec);
>> if (wdata != &write_header->rpc_data)
>> @@ -129,6 +129,10 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
>> wdata->header = NULL;
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> + /* Note: put the open context after processing all the
>> + * page data in order to avoid a deadlock in nfs_write_completion.
>> + */
>> + put_nfs_open_context(ctx);
>> }
>>
>> static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>> @@ -597,6 +601,13 @@ int nfs_write_need_commit(struct nfs_write_data *data)
>>
>> #endif
>>
>> +/*
>> + * nfs_write_completion - process the results of the write RPC calls
>> + *
>> + * Note: this function assumes that the caller is holding a reference
>> + * to the nfs_open_context in order to avoid a deadlock when calling
>> + * nfs_end_page_writeback after nfs_unlock_request.
>> + */
>> static void nfs_write_completion(struct nfs_pgio_header *hdr)
>> {
>> struct nfs_commit_info cinfo;
>> --
>> 1.7.7.6
>>
>


Attachments:
smime.p7s (1.34 kB)

2012-05-10 20:56:16

by Adamson, Dros

[permalink] [raw]
Subject: Re: [PATCH] NFS: Prevent a deadlock in the new read and write code


On May 9, 2012, at 3:15 PM, Myklebust, Trond wrote:

> On Wed, 2012-05-09 at 18:22 +0000, Adamson, Dros wrote:
>> On May 9, 2012, at 2:20 PM, Adamson, Dros wrote:
>>
>>> ACK - This fixes the errant behavior I was seeing.
>>>
>>> Before (on any protocol version mounted at /mnt):
>>> $ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
>>> 1000+0 records in
>>> 1000+0 records out
>>> 10024000 bytes (10 MB) copied, 0.146738 s, 68.3 MB/s
>>> rm ok
>>> rm: cannot remove `/mnt/foo': Directory not empty
>>> ls: /mnt/foo/.nfs000000000004089d00000001: No such file or directory
>>> total 9800
>>> drwxrwxr-x 2 dros dros 4096 May 9 09:46 .
>>> drwxrwxrwx 5 root root 4096 May 9 09:46 ..
>>> -rw-rw-r-- 0 dros dros 10024000 May 9 09:46 .nfs000000000004089d00000001
>>> rm: cannot remove `/mnt/foo': Directory not empty
>>> ...
>>>
>>
>> Oops, this is after the patch is applied:
>>
>>> $ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
>>> 1000+0 records in
>>> 1000+0 records out
>>> 10024000 bytes (10 MB) copied, 0.137596 s, 72.9 MB/s
>>> rm ok
>>> rm ok
>>> rm ok
>>> rm ok
>>> ...
>>>
>>> -dros
>
> OK... Having thought a bit more about the problem, I'd prefer to see the
> write code do the same really as the read code is doing. i.e. release
> the nfs_page after the page_writeback lock has been released.
>
> I've just written a series of patches to do this. The first patch in the
> series is the actual fix. The rest is clean ups in order to make the
> code a bit more readable?

I tested the new patchset and it seems to work great!

-dros



Attachments:
smime.p7s (1.34 kB)

2012-05-09 18:20:40

by Adamson, Dros

[permalink] [raw]
Subject: Re: [PATCH] NFS: Prevent a deadlock in the new read and write code

ACK - This fixes the errant behavior I was seeing.

Before (on any protocol version mounted at /mnt):
$ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
1000+0 records in
1000+0 records out
10024000 bytes (10 MB) copied, 0.146738 s, 68.3 MB/s
rm ok
rm: cannot remove `/mnt/foo': Directory not empty
ls: /mnt/foo/.nfs000000000004089d00000001: No such file or directory
total 9800
drwxrwxr-x 2 dros dros 4096 May 9 09:46 .
drwxrwxrwx 5 root root 4096 May 9 09:46 ..
-rw-rw-r-- 0 dros dros 10024000 May 9 09:46 .nfs000000000004089d00000001
rm: cannot remove `/mnt/foo': Directory not empty
...

$ dd if=/dev/zero of=zerofile bs=10024 count=1000 ; while true; do (rm -rf /mnt/foo && echo rm ok) || ls -la /mnt/foo ; mkdir -p /mnt/foo/; cp zerofile /mnt/foo/file ; done
1000+0 records in
1000+0 records out
10024000 bytes (10 MB) copied, 0.137596 s, 72.9 MB/s
rm ok
rm ok
rm ok
rm ok
...

-dros

On May 9, 2012, at 12:51 PM, Trond Myklebust wrote:

> In order to avoid an ABBA lock ordering issue, the write code must
> unlock and release the nfs_page _before_ it calls end_page_writeback.
> Since the last release of the nfs_page will trigger a call to put
> the open_context, we need to ensure that something else holds a
> reference to the open context until we're done with the
> end_page_writeback so that we don't end up calling iput while we're
> still holding a PG_writeback lock.
> We can fix this by changing nfs_writedata_release to put the open
> context after we have processed all the page data.
>
> The read code does not suffer from the same problem, since it doesn't
> hold a lock on the nfs_page, so it just calls nfs_release_request after
> unlocking the page itself. However, for symmetry reasons, we make the
> same change to nfs_readdata_release.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> Cc: Fred Isaman <[email protected]>
> ---
> fs/nfs/read.c | 3 ++-
> fs/nfs/write.c | 13 ++++++++++++-
> 2 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index f23cf25..dec47e5 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -87,8 +87,8 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
> {
> struct nfs_pgio_header *hdr = rdata->header;
> struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
> + struct nfs_open_context *ctx = rdata->args.context;
>
> - put_nfs_open_context(rdata->args.context);
> if (rdata->pages.pagevec != rdata->pages.page_array)
> kfree(rdata->pages.pagevec);
> if (rdata != &read_header->rpc_data)
> @@ -97,6 +97,7 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
> rdata->header = NULL;
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> + put_nfs_open_context(ctx);
> }
>
> static
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 6f263da..69825f7 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -119,8 +119,8 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
> {
> struct nfs_pgio_header *hdr = wdata->header;
> struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
> + struct nfs_open_context *ctx = wdata->args.context;
>
> - put_nfs_open_context(wdata->args.context);
> if (wdata->pages.pagevec != wdata->pages.page_array)
> kfree(wdata->pages.pagevec);
> if (wdata != &write_header->rpc_data)
> @@ -129,6 +129,10 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
> wdata->header = NULL;
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> + /* Note: put the open context after processing all the
> + * page data in order to avoid a deadlock in nfs_write_completion.
> + */
> + put_nfs_open_context(ctx);
> }
>
> static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> @@ -597,6 +601,13 @@ int nfs_write_need_commit(struct nfs_write_data *data)
>
> #endif
>
> +/*
> + * nfs_write_completion - process the results of the write RPC calls
> + *
> + * Note: this function assumes that the caller is holding a reference
> + * to the nfs_open_context in order to avoid a deadlock when calling
> + * nfs_end_page_writeback after nfs_unlock_request.
> + */
> static void nfs_write_completion(struct nfs_pgio_header *hdr)
> {
> struct nfs_commit_info cinfo;
> --
> 1.7.7.6
>


Attachments:
smime.p7s (1.34 kB)