2017-06-30 20:05:20

by Olga Kornievskaia

[permalink] [raw]
Subject: [PATCH v3 1/1] PNFS fix dangling DS mount

There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
DS mount dangling.

Previously, filelayout_alloc_sec() would call filelayout_check_layout()
which would call nfs4_find_get_deviceid which ups the count on the
device_id. It's only called once and it's matched by the
filelayout_free_lseg() that calls nfs4_fl_put_deviceid().

After that patch, each read/write ends up calling nfs4_find_get_deviceid
and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
in the filelayout's .pg_cleanup and remove it from filelayout_free_lseg.

But we still need a reference to hold over the lifetime of the segment.
For every new lseg that's created we need to take a reference on deviceid
that uses it. It will be released in the "free_lseg" routine.

Signed-off-by: Olga Kornievskaia <[email protected]>
---
fs/nfs/filelayout/filelayout.c | 21 +++++++++++++++++----
fs/nfs/flexfilelayout/flexfilelayout.c | 9 ++++++---
fs/nfs/pnfs.c | 13 ++++++++++---
fs/nfs/pnfs.h | 3 ++-
4 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1cf85d6..86d694e 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -909,9 +909,10 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
struct pnfs_layout_hdr *lo;
struct nfs4_filelayout_segment *fl;
int status;
+ bool new_layout = false;

lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
- gfp_flags);
+ gfp_flags, &new_layout);
if (!lseg)
lseg = ERR_PTR(-ENOMEM);
if (IS_ERR(lseg))
@@ -924,7 +925,8 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
if (status) {
pnfs_put_lseg(lseg);
lseg = ERR_PTR(status);
- }
+ } else if (new_layout)
+ nfs4_get_deviceid(&fl->dsaddr->id_node);
out:
return lseg;
}
@@ -991,18 +993,29 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
nfs_pageio_reset_write_mds(pgio);
}

+static void filelayout_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+ if (desc->pg_lseg) {
+ struct nfs4_filelayout_segment *fl =
+ FILELAYOUT_LSEG(desc->pg_lseg);
+
+ nfs4_fl_put_deviceid(fl->dsaddr);
+ }
+ pnfs_generic_pg_cleanup(desc);
+}
+
static const struct nfs_pageio_ops filelayout_pg_read_ops = {
.pg_init = filelayout_pg_init_read,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
+ .pg_cleanup = filelayout_pg_cleanup,
};

static const struct nfs_pageio_ops filelayout_pg_write_ops = {
.pg_init = filelayout_pg_init_write,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
+ .pg_cleanup = filelayout_pg_cleanup,
};

static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 23542dc..53a4a19 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -820,7 +820,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
NFS4_MAX_UINT64,
IOMODE_READ,
strict_iomode,
- GFP_KERNEL);
+ GFP_KERNEL,
+ NULL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -904,7 +905,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
NFS4_MAX_UINT64,
IOMODE_RW,
false,
- GFP_NOFS);
+ GFP_NOFS,
+ NULL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -960,7 +962,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
NFS4_MAX_UINT64,
IOMODE_RW,
false,
- GFP_NOFS);
+ GFP_NOFS,
+ NULL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c383d09..fb011c6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1698,7 +1698,8 @@ struct pnfs_layout_segment *
u64 count,
enum pnfs_iomode iomode,
bool strict_iomode,
- gfp_t gfp_flags)
+ gfp_t gfp_flags,
+ bool *new)
{
struct pnfs_layout_range arg = {
.iomode = iomode,
@@ -1764,8 +1765,12 @@ struct pnfs_layout_segment *
if (lseg) {
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+ if (new)
+ *new = false;
goto out_unlock;
}
+ if (new)
+ *new = true;

if (!nfs4_valid_open_stateid(ctx->state)) {
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
@@ -2126,7 +2131,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
rd_size,
IOMODE_READ,
false,
- GFP_KERNEL);
+ GFP_KERNEL,
+ NULL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -2153,7 +2159,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
wb_size,
IOMODE_RW,
false,
- GFP_NOFS);
+ GFP_NOFS,
+ NULL);
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 99731e3..978fab0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -291,7 +291,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
u64 count,
enum pnfs_iomode iomode,
bool strict_iomode,
- gfp_t gfp_flags);
+ gfp_t gfp_flags,
+ bool *new);
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
const nfs4_stateid *arg_stateid,
const struct pnfs_layout_range *range,
--
1.8.3.1



2017-07-05 13:52:22

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] PNFS fix dangling DS mount



On 06/30/2017 03:52 PM, Olga Kornievskaia wrote:
> There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
> GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
> DS mount dangling.
>
> Previously, filelayout_alloc_sec() would call filelayout_check_layout()
> which would call nfs4_find_get_deviceid which ups the count on the
> device_id. It's only called once and it's matched by the
> filelayout_free_lseg() that calls nfs4_fl_put_deviceid().
>
> After that patch, each read/write ends up calling nfs4_find_get_deviceid
> and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
> in the filelayout's .pg_cleanup and remove it from filelayout_free_lseg.
>
> But we still need a reference to hold over the lifetime of the segment.
> For every new lseg that's created we need to take a reference on deviceid
> that uses it. It will be released in the "free_lseg" routine.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
Tested-by: Steve Dickson <[email protected]>

steved.

> ---
> fs/nfs/filelayout/filelayout.c | 21 +++++++++++++++++----
> fs/nfs/flexfilelayout/flexfilelayout.c | 9 ++++++---
> fs/nfs/pnfs.c | 13 ++++++++++---
> fs/nfs/pnfs.h | 3 ++-
> 4 files changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
> index 1cf85d6..86d694e 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -909,9 +909,10 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
> struct pnfs_layout_hdr *lo;
> struct nfs4_filelayout_segment *fl;
> int status;
> + bool new_layout = false;
>
> lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
> - gfp_flags);
> + gfp_flags, &new_layout);
> if (!lseg)
> lseg = ERR_PTR(-ENOMEM);
> if (IS_ERR(lseg))
> @@ -924,7 +925,8 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
> if (status) {
> pnfs_put_lseg(lseg);
> lseg = ERR_PTR(status);
> - }
> + } else if (new_layout)
> + nfs4_get_deviceid(&fl->dsaddr->id_node);
> out:
> return lseg;
> }
> @@ -991,18 +993,29 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
> nfs_pageio_reset_write_mds(pgio);
> }
>
> +static void filelayout_pg_cleanup(struct nfs_pageio_descriptor *desc)
> +{
> + if (desc->pg_lseg) {
> + struct nfs4_filelayout_segment *fl =
> + FILELAYOUT_LSEG(desc->pg_lseg);
> +
> + nfs4_fl_put_deviceid(fl->dsaddr);
> + }
> + pnfs_generic_pg_cleanup(desc);
> +}
> +
> static const struct nfs_pageio_ops filelayout_pg_read_ops = {
> .pg_init = filelayout_pg_init_read,
> .pg_test = filelayout_pg_test,
> .pg_doio = pnfs_generic_pg_readpages,
> - .pg_cleanup = pnfs_generic_pg_cleanup,
> + .pg_cleanup = filelayout_pg_cleanup,
> };
>
> static const struct nfs_pageio_ops filelayout_pg_write_ops = {
> .pg_init = filelayout_pg_init_write,
> .pg_test = filelayout_pg_test,
> .pg_doio = pnfs_generic_pg_writepages,
> - .pg_cleanup = pnfs_generic_pg_cleanup,
> + .pg_cleanup = filelayout_pg_cleanup,
> };
>
> static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
> index 23542dc..53a4a19 100644
> --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> @@ -820,7 +820,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
> NFS4_MAX_UINT64,
> IOMODE_READ,
> strict_iomode,
> - GFP_KERNEL);
> + GFP_KERNEL,
> + NULL);
> if (IS_ERR(pgio->pg_lseg)) {
> pgio->pg_error = PTR_ERR(pgio->pg_lseg);
> pgio->pg_lseg = NULL;
> @@ -904,7 +905,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
> NFS4_MAX_UINT64,
> IOMODE_RW,
> false,
> - GFP_NOFS);
> + GFP_NOFS,
> + NULL);
> if (IS_ERR(pgio->pg_lseg)) {
> pgio->pg_error = PTR_ERR(pgio->pg_lseg);
> pgio->pg_lseg = NULL;
> @@ -960,7 +962,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
> NFS4_MAX_UINT64,
> IOMODE_RW,
> false,
> - GFP_NOFS);
> + GFP_NOFS,
> + NULL);
> if (IS_ERR(pgio->pg_lseg)) {
> pgio->pg_error = PTR_ERR(pgio->pg_lseg);
> pgio->pg_lseg = NULL;
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index c383d09..fb011c6 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1698,7 +1698,8 @@ struct pnfs_layout_segment *
> u64 count,
> enum pnfs_iomode iomode,
> bool strict_iomode,
> - gfp_t gfp_flags)
> + gfp_t gfp_flags,
> + bool *new)
> {
> struct pnfs_layout_range arg = {
> .iomode = iomode,
> @@ -1764,8 +1765,12 @@ struct pnfs_layout_segment *
> if (lseg) {
> trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
> PNFS_UPDATE_LAYOUT_FOUND_CACHED);
> + if (new)
> + *new = false;
> goto out_unlock;
> }
> + if (new)
> + *new = true;
>
> if (!nfs4_valid_open_stateid(ctx->state)) {
> trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
> @@ -2126,7 +2131,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
> rd_size,
> IOMODE_READ,
> false,
> - GFP_KERNEL);
> + GFP_KERNEL,
> + NULL);
> if (IS_ERR(pgio->pg_lseg)) {
> pgio->pg_error = PTR_ERR(pgio->pg_lseg);
> pgio->pg_lseg = NULL;
> @@ -2153,7 +2159,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
> wb_size,
> IOMODE_RW,
> false,
> - GFP_NOFS);
> + GFP_NOFS,
> + NULL);
> if (IS_ERR(pgio->pg_lseg)) {
> pgio->pg_error = PTR_ERR(pgio->pg_lseg);
> pgio->pg_lseg = NULL;
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 99731e3..978fab0 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -291,7 +291,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
> u64 count,
> enum pnfs_iomode iomode,
> bool strict_iomode,
> - gfp_t gfp_flags);
> + gfp_t gfp_flags,
> + bool *new);
> void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
> const nfs4_stateid *arg_stateid,
> const struct pnfs_layout_range *range,
>

2017-07-20 19:56:28

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] PNFS fix dangling DS mount

SGkgT2xnYSwNCg0KQXBvbG9naWVzIGZvciBtaXNzaW5nIHRoaXMgcGF0Y2guIEl0IHdhcyBoaWRp
bmcgaW4gbXkgJ2xpbnV4LWZzZGV2ZWwnDQptYWlsYm94LCBzbyBJIGRpZG4ndCByZWNvZ25pc2Ug
aXQgYXMgYSBORlMgcGF0Y2guDQoNCk9uIEZyaSwgMjAxNy0wNi0zMCBhdCAxNTo1MiAtMDQwMCwg
T2xnYSBLb3JuaWV2c2thaWEgd3JvdGU6DQo+IFRoZXJlIGlzIGEgcmVncmVzc2lvbiBieSBjb21t
aXQgOGQ0MGIwZjE0ODQ2ICgiTkZTIGZpbGVsYXlvdXQ6Y2FsbA0KPiBHRVRERVZJQ0VJTkZPIGFm
dGVyIHBuZnNfbGF5b3V0X3Byb2Nlc3MgY29tcGxldGVzIikuIEl0IGxlYXZlcyB0aGUNCj4gRFMg
bW91bnQgZGFuZ2xpbmcuDQo+IA0KPiBQcmV2aW91c2x5LCBmaWxlbGF5b3V0X2FsbG9jX3NlYygp
IHdvdWxkIGNhbGwNCj4gZmlsZWxheW91dF9jaGVja19sYXlvdXQoKQ0KPiB3aGljaCB3b3VsZCBj
YWxsIG5mczRfZmluZF9nZXRfZGV2aWNlaWQgd2hpY2ggdXBzIHRoZSBjb3VudCBvbiB0aGUNCj4g
ZGV2aWNlX2lkLiBJdCdzIG9ubHkgY2FsbGVkIG9uY2UgYW5kIGl0J3MgbWF0Y2hlZCBieSB0aGUN
Cj4gZmlsZWxheW91dF9mcmVlX2xzZWcoKSB0aGF0IGNhbGxzIG5mczRfZmxfcHV0X2RldmljZWlk
KCkuDQo+IA0KPiBBZnRlciB0aGF0IHBhdGNoLCBlYWNoIHJlYWQvd3JpdGUgZW5kcyB1cCBjYWxs
aW5nDQo+IG5mczRfZmluZF9nZXRfZGV2aWNlaWQNCj4gYW5kIHRoZXJlIGlzIG5vIGJhbGFuY2Ug
Zm9yIHRoYXQuIEluc3RlYWQsIGRvIG5mczRfZmxfcHV0X2RldmljZWlkKCkNCj4gaW4gdGhlIGZp
bGVsYXlvdXQncyAucGdfY2xlYW51cCBhbmQgcmVtb3ZlIGl0IGZyb20NCj4gZmlsZWxheW91dF9m
cmVlX2xzZWcuDQo+IA0KPiBCdXQgd2Ugc3RpbGwgbmVlZCBhIHJlZmVyZW5jZSB0byBob2xkIG92
ZXIgdGhlIGxpZmV0aW1lIG9mIHRoZQ0KPiBzZWdtZW50Lg0KPiBGb3IgZXZlcnkgbmV3IGxzZWcg
dGhhdCdzIGNyZWF0ZWQgd2UgbmVlZCB0byB0YWtlIGEgcmVmZXJlbmNlIG9uDQo+IGRldmljZWlk
DQo+IHRoYXQgdXNlcyBpdC4gSXQgd2lsbCBiZSByZWxlYXNlZCBpbiB0aGUgImZyZWVfbHNlZyIg
cm91dGluZS4NCg0KVGhpcyBpcyB3aGF0IEknbSBub3QgdW5kZXJzdGFuZGluZy4gSWYgeW91IGhh
dmUgYSByZWZlcmVuY2UgaW4gdGhlDQpsYXlvdXQgc2VnbWVudCwgdGhlbiB3aHkgZG8geW91IG5l
ZWQgdG8gY2FsbCBuZnM0X2ZpbmRfZ2V0X2RldmljZWlkKCkNCmluIHRoZSByZWFkL3dyaXRlIGNv
ZGU/DQoNCklzbid0IGl0IHN1ZmZpY2llbnQgdG8gY2hhbmdlIHRoZSAicGdfaW5pdCIgY2FsbHMg
dG8gY2hlY2sgd2hldGhlciBvcg0Kbm90IHRoZSBzdHJ1Y3QgbmZzNF9maWxlbGF5b3V0X3NlZ21l
bnQgaGFzIHNldCBhIHZhbHVlIGZvciBkc2FkZHIgKHRoYXQNCm5lZWRzIHRvIGJlIGRvbmUgd2l0
aCBjYXJlIHRvIGF2b2lkIHJhY2VzIC0gY21weGNoZygpIGlzIHlvdXIgZnJpZW5kKSwNCmFuZCB0
aGVuIHJlbHkgb24gdGhhdCByZWZlcmVuY2UgYmVpbmcgc2V0IGZvciB0aGUgcmVtYWluZGVyIG9m
IHRoZQ0KbGF5b3V0IHNlZ21lbnQgbGlmZXRpbWU/DQoNCg0KLS0gDQpUcm9uZCBNeWtsZWJ1c3QN
CkxpbnV4IE5GUyBjbGllbnQgbWFpbnRhaW5lciwgUHJpbWFyeURhdGENCnRyb25kLm15a2xlYnVz
dEBwcmltYXJ5ZGF0YS5jb20NCg==


2017-07-20 20:14:12

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] PNFS fix dangling DS mount


> On Jul 20, 2017, at 3:56 PM, Trond Myklebust <[email protected]> =
wrote:
>=20
> Hi Olga,
>=20
> Apologies for missing this patch. It was hiding in my 'linux-fsdevel'
> mailbox, so I didn't recognise it as a NFS patch.
>=20

Yeah after I mailed it out I realized I cc-ed fsdevel incorrectly.

> On Fri, 2017-06-30 at 15:52 -0400, Olga Kornievskaia wrote:
>> There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
>> GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
>> DS mount dangling.
>>=20
>> Previously, filelayout_alloc_sec() would call
>> filelayout_check_layout()
>> which would call nfs4_find_get_deviceid which ups the count on the
>> device_id. It's only called once and it's matched by the
>> filelayout_free_lseg() that calls nfs4_fl_put_deviceid().
>>=20
>> After that patch, each read/write ends up calling
>> nfs4_find_get_deviceid
>> and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
>> in the filelayout's .pg_cleanup and remove it from
>> filelayout_free_lseg.
>>=20
>> But we still need a reference to hold over the lifetime of the
>> segment.
>> For every new lseg that's created we need to take a reference on
>> deviceid
>> that uses it. It will be released in the "free_lseg" routine.
>=20
> This is what I'm not understanding. If you have a reference in the
> layout segment, then why do you need to call nfs4_find_get_deviceid()
> in the read/write code?

I think I=E2=80=99m probably misunderstanding the question. It sounds to =
me that you asking me for why the commit 8d40b0f14846 was done the way =
it was done (I=E2=80=99d would say it was done as per your suggestion).

I would say the call to nfs4_find_get_deviceid() has always been in the =
read/write code. It was a part of the pnfs_update_layout() but the =
commit 8d40b0f14846 moved it out of it (so that the layoutget was =
complete) and then the call to the getdeviceinfo would be done.=20

> Isn't it sufficient to change the "pg_init" calls to check whether or
> not the struct nfs4_filelayout_segment has set a value for dsaddr =
(that
> needs to be done with care to avoid races - cmpxchg() is your friend),
> and then rely on that reference being set for the remainder of the
> layout segment lifetime?

Are you suggesting to change when getdeviceinfo is suppose to be called?

>=20
>=20
> --=20
> Trond Myklebust
> Linux NFS client maintainer, PrimaryData
> [email protected]


2017-07-20 21:09:49

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] PNFS fix dangling DS mount

T24gVGh1LCAyMDE3LTA3LTIwIGF0IDE2OjE0IC0wNDAwLCBPbGdhIEtvcm5pZXZza2FpYSB3cm90
ZToNCj4gPiBPbiBKdWwgMjAsIDIwMTcsIGF0IDM6NTYgUE0sIFRyb25kIE15a2xlYnVzdCA8dHJv
bmRteUBwcmltYXJ5ZGF0YS5jDQo+ID4gb20+IHdyb3RlOg0KPiA+IA0KPiA+IEhpIE9sZ2EsDQo+
ID4gDQo+ID4gQXBvbG9naWVzIGZvciBtaXNzaW5nIHRoaXMgcGF0Y2guIEl0IHdhcyBoaWRpbmcg
aW4gbXkgJ2xpbnV4LQ0KPiA+IGZzZGV2ZWwnDQo+ID4gbWFpbGJveCwgc28gSSBkaWRuJ3QgcmVj
b2duaXNlIGl0IGFzIGEgTkZTIHBhdGNoLg0KPiA+IA0KPiANCj4gWWVhaCBhZnRlciBJIG1haWxl
ZCBpdCBvdXQgSSByZWFsaXplZCBJIGNjLWVkIGZzZGV2ZWwgaW5jb3JyZWN0bHkuDQo+IA0KPiA+
IE9uIEZyaSwgMjAxNy0wNi0zMCBhdCAxNTo1MiAtMDQwMCwgT2xnYSBLb3JuaWV2c2thaWEgd3Jv
dGU6DQo+ID4gPiBUaGVyZSBpcyBhIHJlZ3Jlc3Npb24gYnkgY29tbWl0IDhkNDBiMGYxNDg0NiAo
Ik5GUw0KPiA+ID4gZmlsZWxheW91dDpjYWxsDQo+ID4gPiBHRVRERVZJQ0VJTkZPIGFmdGVyIHBu
ZnNfbGF5b3V0X3Byb2Nlc3MgY29tcGxldGVzIikuIEl0IGxlYXZlcw0KPiA+ID4gdGhlDQo+ID4g
PiBEUyBtb3VudCBkYW5nbGluZy4NCj4gPiA+IA0KPiA+ID4gUHJldmlvdXNseSwgZmlsZWxheW91
dF9hbGxvY19zZWMoKSB3b3VsZCBjYWxsDQo+ID4gPiBmaWxlbGF5b3V0X2NoZWNrX2xheW91dCgp
DQo+ID4gPiB3aGljaCB3b3VsZCBjYWxsIG5mczRfZmluZF9nZXRfZGV2aWNlaWQgd2hpY2ggdXBz
IHRoZSBjb3VudCBvbg0KPiA+ID4gdGhlDQo+ID4gPiBkZXZpY2VfaWQuIEl0J3Mgb25seSBjYWxs
ZWQgb25jZSBhbmQgaXQncyBtYXRjaGVkIGJ5IHRoZQ0KPiA+ID4gZmlsZWxheW91dF9mcmVlX2xz
ZWcoKSB0aGF0IGNhbGxzIG5mczRfZmxfcHV0X2RldmljZWlkKCkuDQo+ID4gPiANCj4gPiA+IEFm
dGVyIHRoYXQgcGF0Y2gsIGVhY2ggcmVhZC93cml0ZSBlbmRzIHVwIGNhbGxpbmcNCj4gPiA+IG5m
czRfZmluZF9nZXRfZGV2aWNlaWQNCj4gPiA+IGFuZCB0aGVyZSBpcyBubyBiYWxhbmNlIGZvciB0
aGF0LiBJbnN0ZWFkLCBkbw0KPiA+ID4gbmZzNF9mbF9wdXRfZGV2aWNlaWQoKQ0KPiA+ID4gaW4g
dGhlIGZpbGVsYXlvdXQncyAucGdfY2xlYW51cCBhbmQgcmVtb3ZlIGl0IGZyb20NCj4gPiA+IGZp
bGVsYXlvdXRfZnJlZV9sc2VnLg0KPiA+ID4gDQo+ID4gPiBCdXQgd2Ugc3RpbGwgbmVlZCBhIHJl
ZmVyZW5jZSB0byBob2xkIG92ZXIgdGhlIGxpZmV0aW1lIG9mIHRoZQ0KPiA+ID4gc2VnbWVudC4N
Cj4gPiA+IEZvciBldmVyeSBuZXcgbHNlZyB0aGF0J3MgY3JlYXRlZCB3ZSBuZWVkIHRvIHRha2Ug
YSByZWZlcmVuY2Ugb24NCj4gPiA+IGRldmljZWlkDQo+ID4gPiB0aGF0IHVzZXMgaXQuIEl0IHdp
bGwgYmUgcmVsZWFzZWQgaW4gdGhlICJmcmVlX2xzZWciIHJvdXRpbmUuDQo+ID4gDQo+ID4gVGhp
cyBpcyB3aGF0IEknbSBub3QgdW5kZXJzdGFuZGluZy4gSWYgeW91IGhhdmUgYSByZWZlcmVuY2Ug
aW4gdGhlDQo+ID4gbGF5b3V0IHNlZ21lbnQsIHRoZW4gd2h5IGRvIHlvdSBuZWVkIHRvIGNhbGwN
Cj4gPiBuZnM0X2ZpbmRfZ2V0X2RldmljZWlkKCkNCj4gPiBpbiB0aGUgcmVhZC93cml0ZSBjb2Rl
Pw0KPiANCj4gSSB0aGluayBJ4oCZbSBwcm9iYWJseSBtaXN1bmRlcnN0YW5kaW5nIHRoZSBxdWVz
dGlvbi4gSXQgc291bmRzIHRvIG1lDQo+IHRoYXQgeW91IGFza2luZyBtZSBmb3Igd2h5IHRoZSBj
b21taXQgOGQ0MGIwZjE0ODQ2IHdhcyBkb25lIHRoZSB3YXkNCj4gaXQgd2FzIGRvbmUgKEnigJlk
IHdvdWxkIHNheSBpdCB3YXMgZG9uZSBhcyBwZXIgeW91ciBzdWdnZXN0aW9uKS4NCj4gDQo+IEkg
d291bGQgc2F5IHRoZSBjYWxsIHRvIG5mczRfZmluZF9nZXRfZGV2aWNlaWQoKSBoYXMgYWx3YXlz
IGJlZW4gaW4NCj4gdGhlIHJlYWQvd3JpdGUgY29kZS4gSXQgd2FzIGEgcGFydCBvZiB0aGUgcG5m
c191cGRhdGVfbGF5b3V0KCkgYnV0DQo+IHRoZSBjb21taXQgOGQ0MGIwZjE0ODQ2IG1vdmVkIGl0
IG91dCBvZiBpdCAoc28gdGhhdCB0aGUgbGF5b3V0Z2V0IHdhcw0KPiBjb21wbGV0ZSkgYW5kIHRo
ZW4gdGhlIGNhbGwgdG8gdGhlIGdldGRldmljZWluZm8gd291bGQgYmUgZG9uZS4gDQo+IA0KPiA+
IElzbid0IGl0IHN1ZmZpY2llbnQgdG8gY2hhbmdlIHRoZSAicGdfaW5pdCIgY2FsbHMgdG8gY2hl
Y2sgd2hldGhlcg0KPiA+IG9yDQo+ID4gbm90IHRoZSBzdHJ1Y3QgbmZzNF9maWxlbGF5b3V0X3Nl
Z21lbnQgaGFzIHNldCBhIHZhbHVlIGZvciBkc2FkZHINCj4gPiAodGhhdA0KPiA+IG5lZWRzIHRv
IGJlIGRvbmUgd2l0aCBjYXJlIHRvIGF2b2lkIHJhY2VzIC0gY21weGNoZygpIGlzIHlvdXINCj4g
PiBmcmllbmQpLA0KPiA+IGFuZCB0aGVuIHJlbHkgb24gdGhhdCByZWZlcmVuY2UgYmVpbmcgc2V0
IGZvciB0aGUgcmVtYWluZGVyIG9mIHRoZQ0KPiA+IGxheW91dCBzZWdtZW50IGxpZmV0aW1lPw0K
PiANCj4gQXJlIHlvdSBzdWdnZXN0aW5nIHRvIGNoYW5nZSB3aGVuIGdldGRldmljZWluZm8gaXMg
c3VwcG9zZSB0byBiZQ0KPiBjYWxsZWQ/DQo+IA0KDQpOby4gTm93IHRoYXQgSSdtIGxvb2tpbmcg
YXQgZmlsZWxheW91dF9jaGVja19kZXZpY2VpZCgpLCBoZXJlIGlzIHdoYXQgSQ0KIHN1Z2dlc3Q6
IHdlIG5lZWQgdG8gZW5zdXJlIHRoYXQgZmlsZWxheW91dF9jaGVja19kZXZpY2VpZCgpIHNldHMg
dGhlDQpkZXZpY2VpZCBvbmNlLCBhbmQgb25seSBvbmNlIQ0KDQpIb3cgYWJvdXQgc29tZXRoaW5n
IGxpa2UgdGhlIGZvbGxvd2luZyAodW50ZXN0ZWQpIHBhdGNoPw0KDQo4PC0tLS0tLS0tLS0tLS0t
LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCkZyb20gMWU0MGVlMTM5
NTBkMDNhZDJhNTRhMGQxZGJhMzVmMmEwNGMyOGNhMCBNb24gU2VwIDE3IDAwOjAwOjAwIDIwMDEN
CkZyb206IFRyb25kIE15a2xlYnVzdCA8dHJvbmQubXlrbGVidXN0QHByaW1hcnlkYXRhLmNvbT4N
CkRhdGU6IFRodSwgMjAgSnVsIDIwMTcgMTc6MDA6MDIgLTA0MDANClN1YmplY3Q6IFtQQVRDSF0g
TkZTL2ZpbGVsYXlvdXQ6IEZpeCByYWN5IHNldHRpbmcgb2YgZmwtPmRzYWRkciBpbg0KIGZpbGVs
YXlvdXRfY2hlY2tfZGV2aWNlaWQoKQ0KDQpXZSBtdXN0IHNldCBmbC0+ZHNhZGRyIG9uY2UsIGFu
ZCBvbmNlIG9ubHksIGV2ZW4gaWYgdGhlcmUgYXJlIG11bHRpcGxlDQpwcm9jZXNzZXMgY2FsbGlu
ZyBmaWxlbGF5b3V0X2NoZWNrX2RldmljZWlkKCkgZm9yIHRoZSBzYW1lIGxheW91dA0Kc2VnbWVu
dC4NCg0KUmVwb3J0ZWQtYnk6IE9sZ2EgS29ybmlldnNrYWlhIDxrb2xnYUBuZXRhcHAuY29tPg0K
U2lnbmVkLW9mZi1ieTogVHJvbmQgTXlrbGVidXN0IDx0cm9uZC5teWtsZWJ1c3RAcHJpbWFyeWRh
dGEuY29tPg0KLS0tDQogZnMvbmZzL2ZpbGVsYXlvdXQvZmlsZWxheW91dC5jIHwgMTMgKysrKysr
KysrKystLQ0KIDEgZmlsZSBjaGFuZ2VkLCAxMSBpbnNlcnRpb25zKCspLCAyIGRlbGV0aW9ucygt
KQ0KDQpkaWZmIC0tZ2l0IGEvZnMvbmZzL2ZpbGVsYXlvdXQvZmlsZWxheW91dC5jIGIvZnMvbmZz
L2ZpbGVsYXlvdXQvZmlsZWxheW91dC5jDQppbmRleCAwODBmYzZiMjc4YmQuLjQ0YzYzOGI3ODc2
YyAxMDA2NDQNCi0tLSBhL2ZzL25mcy9maWxlbGF5b3V0L2ZpbGVsYXlvdXQuYw0KKysrIGIvZnMv
bmZzL2ZpbGVsYXlvdXQvZmlsZWxheW91dC5jDQpAQCAtNTQyLDYgKzU0MiwxMCBAQCBmaWxlbGF5
b3V0X2NoZWNrX2RldmljZWlkKHN0cnVjdCBwbmZzX2xheW91dF9oZHIgKmxvLA0KIAlzdHJ1Y3Qg
bmZzNF9maWxlX2xheW91dF9kc2FkZHIgKmRzYWRkcjsNCiAJaW50IHN0YXR1cyA9IC1FSU5WQUw7
DQogDQorCS8qIElzIHRoZSBkZXZpY2VpZCBhbHJlYWR5IHNldD8gSWYgc28sIHdlJ3JlIGdvb2Qu
ICovDQorCWlmIChmbC0+ZHNhZGRyICE9IE5VTEwpDQorCQlyZXR1cm4gMDsNCisNCiAJLyogZmlu
ZCBhbmQgcmVmZXJlbmNlIHRoZSBkZXZpY2VpZCAqLw0KIAlkID0gbmZzNF9maW5kX2dldF9kZXZp
Y2VpZChORlNfU0VSVkVSKGxvLT5wbGhfaW5vZGUpLCAmZmwtPmRldmljZWlkLA0KIAkJCWxvLT5w
bGhfbGNfY3JlZCwgZ2ZwX2ZsYWdzKTsNCkBAIC01NTMsOCArNTU3LDYgQEAgZmlsZWxheW91dF9j
aGVja19kZXZpY2VpZChzdHJ1Y3QgcG5mc19sYXlvdXRfaGRyICpsbywNCiAJaWYgKGZpbGVsYXlv
dXRfdGVzdF9kZXZpZF91bmF2YWlsYWJsZSgmZHNhZGRyLT5pZF9ub2RlKSkNCiAJCWdvdG8gb3V0
X3B1dDsNCiANCi0JZmwtPmRzYWRkciA9IGRzYWRkcjsNCi0NCiAJaWYgKGZsLT5maXJzdF9zdHJp
cGVfaW5kZXggPj0gZHNhZGRyLT5zdHJpcGVfY291bnQpIHsNCiAJCWRwcmludGsoIiVzIEJhZCBm
aXJzdF9zdHJpcGVfaW5kZXggJXVcbiIsDQogCQkJCV9fZnVuY19fLCBmbC0+Zmlyc3Rfc3RyaXBl
X2luZGV4KTsNCkBAIC01NzAsNiArNTcyLDEzIEBAIGZpbGVsYXlvdXRfY2hlY2tfZGV2aWNlaWQo
c3RydWN0IHBuZnNfbGF5b3V0X2hkciAqbG8sDQogCQlnb3RvIG91dF9wdXQ7DQogCX0NCiAJc3Rh
dHVzID0gMDsNCisNCisJLyoNCisJICogQXRvbWljIGNvbXBhcmUgYW5kIHhjaGFuZ2UgdG8gZW5z
dXJlIHdlIGRvbid0IHNjcmliYmxlDQorCSAqIG92ZXIgYSBub24tTlVMTCBwb2ludGVyLg0KKwkg
Ki8NCisJaWYgKGNtcHhjaGcoJmZsLT5kc2FkZHIsIE5VTEwsIGRzYWRkcikgIT0gTlVMTCkNCisJ
CWdvdG8gb3V0X3B1dDsNCiBvdXQ6DQogCXJldHVybiBzdGF0dXM7DQogb3V0X3B1dDoNCi0tIA0K
Mi4xMy4zDQoNCi0tIA0KVHJvbmQgTXlrbGVidXN0DQpMaW51eCBORlMgY2xpZW50IG1haW50YWlu
ZXIsIFByaW1hcnlEYXRhDQp0cm9uZC5teWtsZWJ1c3RAcHJpbWFyeWRhdGEuY29tDQo=


2017-07-21 15:46:49

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] PNFS fix dangling DS mount

On Thu, Jul 20, 2017 at 5:09 PM, Trond Myklebust
<[email protected]> wrote:
> On Thu, 2017-07-20 at 16:14 -0400, Olga Kornievskaia wrote:
>> > On Jul 20, 2017, at 3:56 PM, Trond Myklebust <[email protected]
>> > om> wrote:
>> >
>> > Hi Olga,
>> >
>> > Apologies for missing this patch. It was hiding in my 'linux-
>> > fsdevel'
>> > mailbox, so I didn't recognise it as a NFS patch.
>> >
>>
>> Yeah after I mailed it out I realized I cc-ed fsdevel incorrectly.
>>
>> > On Fri, 2017-06-30 at 15:52 -0400, Olga Kornievskaia wrote:
>> > > There is a regression by commit 8d40b0f14846 ("NFS
>> > > filelayout:call
>> > > GETDEVICEINFO after pnfs_layout_process completes"). It leaves
>> > > the
>> > > DS mount dangling.
>> > >
>> > > Previously, filelayout_alloc_sec() would call
>> > > filelayout_check_layout()
>> > > which would call nfs4_find_get_deviceid which ups the count on
>> > > the
>> > > device_id. It's only called once and it's matched by the
>> > > filelayout_free_lseg() that calls nfs4_fl_put_deviceid().
>> > >
>> > > After that patch, each read/write ends up calling
>> > > nfs4_find_get_deviceid
>> > > and there is no balance for that. Instead, do
>> > > nfs4_fl_put_deviceid()
>> > > in the filelayout's .pg_cleanup and remove it from
>> > > filelayout_free_lseg.
>> > >
>> > > But we still need a reference to hold over the lifetime of the
>> > > segment.
>> > > For every new lseg that's created we need to take a reference on
>> > > deviceid
>> > > that uses it. It will be released in the "free_lseg" routine.
>> >
>> > This is what I'm not understanding. If you have a reference in the
>> > layout segment, then why do you need to call
>> > nfs4_find_get_deviceid()
>> > in the read/write code?
>>
>> I think I=E2=80=99m probably misunderstanding the question. It sounds to=
me
>> that you asking me for why the commit 8d40b0f14846 was done the way
>> it was done (I=E2=80=99d would say it was done as per your suggestion).
>>
>> I would say the call to nfs4_find_get_deviceid() has always been in
>> the read/write code. It was a part of the pnfs_update_layout() but
>> the commit 8d40b0f14846 moved it out of it (so that the layoutget was
>> complete) and then the call to the getdeviceinfo would be done.
>>
>> > Isn't it sufficient to change the "pg_init" calls to check whether
>> > or
>> > not the struct nfs4_filelayout_segment has set a value for dsaddr
>> > (that
>> > needs to be done with care to avoid races - cmpxchg() is your
>> > friend),
>> > and then rely on that reference being set for the remainder of the
>> > layout segment lifetime?
>>
>> Are you suggesting to change when getdeviceinfo is suppose to be
>> called?
>>
>
> No. Now that I'm looking at filelayout_check_deviceid(), here is what I
> suggest: we need to ensure that filelayout_check_deviceid() sets the
> deviceid once, and only once!
>
> How about something like the following (untested) patch?
>
> 8<-------------------------------------------------------
> From 1e40ee13950d03ad2a54a0d1dba35f2a04c28ca0 Mon Sep 17 00:00:00 2001
> From: Trond Myklebust <[email protected]>
> Date: Thu, 20 Jul 2017 17:00:02 -0400
> Subject: [PATCH] NFS/filelayout: Fix racy setting of fl->dsaddr in
> filelayout_check_deviceid()
>
> We must set fl->dsaddr once, and once only, even if there are multiple
> processes calling filelayout_check_deviceid() for the same layout
> segment.
>
> Reported-by: Olga Kornievskaia <[email protected]>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/filelayout/filelayout.c | 13 +++++++++++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayou=
t.c
> index 080fc6b278bd..44c638b7876c 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -542,6 +542,10 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo=
,
> struct nfs4_file_layout_dsaddr *dsaddr;
> int status =3D -EINVAL;
>
> + /* Is the deviceid already set? If so, we're good. */
> + if (fl->dsaddr !=3D NULL)
> + return 0;
> +
> /* find and reference the deviceid */
> d =3D nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->devi=
ceid,
> lo->plh_lc_cred, gfp_flags);
> @@ -553,8 +557,6 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo,
> if (filelayout_test_devid_unavailable(&dsaddr->id_node))
> goto out_put;
>
> - fl->dsaddr =3D dsaddr;
> -
> if (fl->first_stripe_index >=3D dsaddr->stripe_count) {
> dprintk("%s Bad first_stripe_index %u\n",
> __func__, fl->first_stripe_index);
> @@ -570,6 +572,13 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo=
,
> goto out_put;
> }
> status =3D 0;
> +
> + /*
> + * Atomic compare and xchange to ensure we don't scribble
> + * over a non-NULL pointer.
> + */
> + if (cmpxchg(&fl->dsaddr, NULL, dsaddr) !=3D NULL)
> + goto out_put;
> out:
> return status;
> out_put:
> --
> 2.13.3
>

No dangling DS after this patch as well.

Question: do we need to worry about that the checks that are done on
the deviceid done after finding one are skipped with this patch?