Block/SCSI layout write completion may add committable extents to the
extent tree before updating the layout's last-written byte under the inode
lock. If a sync happens before this value is updated, then
prepare_layoutcommit may find and encode these extents which would produce
a LAYOUTCOMMIT request whose encoded extents are larger than the request's
loca_length.
Fix this by holding the i_lock while marking extents writeable and updating
the value of the last written byte. Then extending the i_lock over
prepare_layoutcommit in pnfs_layoutcommit_inode() ensures we won't find
extents starting beyond the current last written byte.
Signed-off-by: Benjamin Coddington <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 3 +++
fs/nfs/pnfs.c | 4 +---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 17a42e4eb872..36923b55f4f8 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -343,8 +343,11 @@ static void bl_write_cleanup(struct work_struct *work)
u64 end = (hdr->args.offset + hdr->args.count +
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
+ spin_lock(&hdr->inode->i_lock);
+ bl->bl_layout.plh_lwb = hdr->args.offset + hdr->res.count;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
(end - start) >> SECTOR_SHIFT);
+ spin_unlock(&hdr->inode->i_lock);
}
pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4110c1dc8f68..978df68c498c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2386,7 +2386,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
end_pos = nfsi->layout->plh_lwb;
nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
- spin_unlock(&inode->i_lock);
data->args.inode = inode;
data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
@@ -2403,14 +2402,13 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = ld->prepare_layoutcommit(&data->args);
if (status) {
put_rpccred(data->cred);
- spin_lock(&inode->i_lock);
set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
if (end_pos > nfsi->layout->plh_lwb)
nfsi->layout->plh_lwb = end_pos;
goto out_unlock;
}
}
-
+ spin_unlock(&inode->i_lock);
status = nfs4_proc_layoutcommit(data, sync);
out:
--
2.5.5
Hi Ben,
Thanks for continuing to work on this.
> On Aug 18, 2016, at 11:55, Benjamin Coddington <[email protected]> wrot=
e:
>=20
> Block/SCSI layout write completion may add committable extents to the
> extent tree before updating the layout's last-written byte under the inod=
e
> lock. If a sync happens before this value is updated, then
> prepare_layoutcommit may find and encode these extents which would produc=
e
> a LAYOUTCOMMIT request whose encoded extents are larger than the request'=
s
> loca_length.
>=20
> Fix this by holding the i_lock while marking extents writeable and updati=
ng
> the value of the last written byte. Then extending the i_lock over
> prepare_layoutcommit in pnfs_layoutcommit_inode() ensures we won't find
> extents starting beyond the current last written byte.
>=20
> Signed-off-by: Benjamin Coddington <[email protected]>
> ---
> fs/nfs/blocklayout/blocklayout.c | 3 +++
> fs/nfs/pnfs.c | 4 +---
> 2 files changed, 4 insertions(+), 3 deletions(-)
>=20
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blockl=
ayout.c
> index 17a42e4eb872..36923b55f4f8 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -343,8 +343,11 @@ static void bl_write_cleanup(struct work_struct *wor=
k)
> =09=09u64 end =3D (hdr->args.offset + hdr->args.count +
> =09=09=09PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>=20
> +=09=09spin_lock(&hdr->inode->i_lock);
> +=09=09bl->bl_layout.plh_lwb =3D hdr->args.offset + hdr->res.count;
> =09=09ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
> =09=09=09=09=09(end - start) >> SECTOR_SHIFT);
> +=09=09spin_unlock(&hdr->inode->i_lock);
> =09}
>=20
> =09pnfs_ld_write_done(hdr);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 4110c1dc8f68..978df68c498c 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -2386,7 +2386,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool s=
ync)
> =09end_pos =3D nfsi->layout->plh_lwb;
>=20
> =09nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
> -=09spin_unlock(&inode->i_lock);
>=20
> =09data->args.inode =3D inode;
> =09data->cred =3D get_rpccred(nfsi->layout->plh_lc_cred);
> @@ -2403,14 +2402,13 @@ pnfs_layoutcommit_inode(struct inode *inode, bool=
sync)
> =09=09status =3D ld->prepare_layoutcommit(&data->args);
Doesn=92t ext_tree_prepare_commit() contain a GFP_NOFS allocation that coul=
d sleep?
> =09=09if (status) {
> =09=09=09put_rpccred(data->cred);
> -=09=09=09spin_lock(&inode->i_lock);
> =09=09=09set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
> =09=09=09if (end_pos > nfsi->layout->plh_lwb)
> =09=09=09=09nfsi->layout->plh_lwb =3D end_pos;
> =09=09=09goto out_unlock;
> =09=09}
> =09}
> -
> +=09spin_unlock(&inode->i_lock);
>=20
> =09status =3D nfs4_proc_layoutcommit(data, sync);
> out:
> --=20
> 2.5.5
>=20
On 18 Aug 2016, at 14:19, Trond Myklebust wrote:
> Hi Ben,
>
> Thanks for continuing to work on this.
>
>> On Aug 18, 2016, at 11:55, Benjamin Coddington <[email protected]>
>> wrote:
>>
>> Block/SCSI layout write completion may add committable extents to the
>> extent tree before updating the layout's last-written byte under the
>> inode
>> lock. If a sync happens before this value is updated, then
>> prepare_layoutcommit may find and encode these extents which would
>> produce
>> a LAYOUTCOMMIT request whose encoded extents are larger than the
>> request's
>> loca_length.
>>
>> Fix this by holding the i_lock while marking extents writeable and
>> updating
>> the value of the last written byte. Then extending the i_lock over
>> prepare_layoutcommit in pnfs_layoutcommit_inode() ensures we won't
>> find
>> extents starting beyond the current last written byte.
>>
>> Signed-off-by: Benjamin Coddington <[email protected]>
>> ---
>> fs/nfs/blocklayout/blocklayout.c | 3 +++
>> fs/nfs/pnfs.c | 4 +---
>> 2 files changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.c
>> b/fs/nfs/blocklayout/blocklayout.c
>> index 17a42e4eb872..36923b55f4f8 100644
>> --- a/fs/nfs/blocklayout/blocklayout.c
>> +++ b/fs/nfs/blocklayout/blocklayout.c
>> @@ -343,8 +343,11 @@ static void bl_write_cleanup(struct work_struct
>> *work)
>> u64 end = (hdr->args.offset + hdr->args.count +
>> PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>>
>> + spin_lock(&hdr->inode->i_lock);
>> + bl->bl_layout.plh_lwb = hdr->args.offset + hdr->res.count;
>> ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
>> (end - start) >> SECTOR_SHIFT);
>> + spin_unlock(&hdr->inode->i_lock);
>> }
>>
>> pnfs_ld_write_done(hdr);
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 4110c1dc8f68..978df68c498c 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -2386,7 +2386,6 @@ pnfs_layoutcommit_inode(struct inode *inode,
>> bool sync)
>> end_pos = nfsi->layout->plh_lwb;
>>
>> nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
>> - spin_unlock(&inode->i_lock);
>>
>> data->args.inode = inode;
>> data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
>> @@ -2403,14 +2402,13 @@ pnfs_layoutcommit_inode(struct inode *inode,
>> bool sync)
>> status = ld->prepare_layoutcommit(&data->args);
>
> Doesn’t ext_tree_prepare_commit() contain a GFP_NOFS allocation that
> could sleep?
Ah, yes it does. I'll look for another way to fix this.
Ben
>
>> if (status) {
>> put_rpccred(data->cred);
>> - spin_lock(&inode->i_lock);
>> set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
>> if (end_pos > nfsi->layout->plh_lwb)
>> nfsi->layout->plh_lwb = end_pos;
>> goto out_unlock;
>> }
>> }
>> -
>> + spin_unlock(&inode->i_lock);
>>
>> status = nfs4_proc_layoutcommit(data, sync);
>> out:
>> --
>> 2.5.5
>>
Change from v1:
Instead of moving the i_lock protected region around, store last written
byte for block layouts on struct pnfs_block_layout and use that when
encoding LAYOUTCOMMIT.
8<------------------------------------------------------------------------
Block/SCSI layout write completion may add committable extents to the
extent tree before updating the layout's last-written byte under the inode
lock. If a sync happens before this value is updated, then
prepare_layoutcommit may find and encode these extents which would produce
a LAYOUTCOMMIT request whose encoded extents are larger than the request's
loca_length.
Fix this by using a last-written byte value that is updated atomically with
the extent tree.
Signed-off-by: Benjamin Coddington <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 2 +-
fs/nfs/blocklayout/blocklayout.h | 3 ++-
fs/nfs/blocklayout/extent_tree.c | 8 +++++---
3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 17a42e4eb872..25cdd559831c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct *work)
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
- (end - start) >> SECTOR_SHIFT);
+ (end - start) >> SECTOR_SHIFT, end);
}
pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
bool bl_scsi_layout;
+ u64 bl_lwb;
};
static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
sector_t end);
int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len);
+ sector_t len, u64 lwb);
bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw);
int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..8d08dfe1e057 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len)
+ sector_t len, u64 lwb)
{
struct rb_root *root = &bl->bl_ext_rw;
sector_t end = start + len;
@@ -471,6 +471,7 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
}
out:
+ bl->bl_lwb = lwb;
spin_unlock(&bl->bl_ext_lock);
__ext_put_deviceids(&tmp);
@@ -518,7 +519,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
}
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
- size_t buffer_size, size_t *count)
+ size_t buffer_size, size_t *count, __u64 *lastbyte)
{
struct pnfs_block_extent *be;
int ret = 0;
@@ -541,6 +542,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
else
p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
+ *lastbyte = bl->bl_lwb - 1;
}
spin_unlock(&bl->bl_ext_lock);
@@ -564,7 +566,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
arg->layoutupdate_pages = &arg->layoutupdate_page;
retry:
- ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
--
2.5.5
> On Aug 22, 2016, at 09:48, Benjamin Coddington <[email protected]> wrot=
e:
>=20
> Change from v1:
>=20
> Instead of moving the i_lock protected region around, store last written
> byte for block layouts on struct pnfs_block_layout and use that when
> encoding LAYOUTCOMMIT.
>=20
> 8<-----------------------------------------------------------------------=
-
>=20
> Block/SCSI layout write completion may add committable extents to the
> extent tree before updating the layout's last-written byte under the inod=
e
> lock. If a sync happens before this value is updated, then
> prepare_layoutcommit may find and encode these extents which would produc=
e
> a LAYOUTCOMMIT request whose encoded extents are larger than the request'=
s
> loca_length.
>=20
> Fix this by using a last-written byte value that is updated atomically wi=
th
> the extent tree.
>=20
> Signed-off-by: Benjamin Coddington <[email protected]>
> ---
> fs/nfs/blocklayout/blocklayout.c | 2 +-
> fs/nfs/blocklayout/blocklayout.h | 3 ++-
> fs/nfs/blocklayout/extent_tree.c | 8 +++++---
> 3 files changed, 8 insertions(+), 5 deletions(-)
>=20
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blockl=
ayout.c
> index 17a42e4eb872..25cdd559831c 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct *work=
)
> =09=09=09PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>=20
> =09=09ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
> -=09=09=09=09=09(end - start) >> SECTOR_SHIFT);
> +=09=09=09=09=09(end - start) >> SECTOR_SHIFT, end);
> =09}
>=20
> =09pnfs_ld_write_done(hdr);
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blockl=
ayout.h
> index 18e6fd0b9506..efc007f00742 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -141,6 +141,7 @@ struct pnfs_block_layout {
> =09struct rb_root=09=09bl_ext_ro;
> =09spinlock_t=09=09bl_ext_lock; /* Protects list manipulation */
> =09bool=09=09=09bl_scsi_layout;
> +=09u64=09=09=09bl_lwb;
> };
>=20
> static inline struct pnfs_block_layout *
> @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
> int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start=
,
> =09=09sector_t end);
> int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
> -=09=09sector_t len);
> +=09=09sector_t len, u64 lwb);
> bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
> =09=09struct pnfs_block_extent *ret, bool rw);
> int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
> diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent=
_tree.c
> index 992bcb19c11e..8d08dfe1e057 100644
> --- a/fs/nfs/blocklayout/extent_tree.c
> +++ b/fs/nfs/blocklayout/extent_tree.c
> @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_bloc=
k_extent *be,
>=20
> int
> ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
> -=09=09sector_t len)
> +=09=09sector_t len, u64 lwb)
> {
> =09struct rb_root *root =3D &bl->bl_ext_rw;
> =09sector_t end =3D start + len;
> @@ -471,6 +471,7 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, s=
ector_t start,
> =09=09}
> =09}
> out:
> +=09bl->bl_lwb =3D lwb;
> =09spin_unlock(&bl->bl_ext_lock);
>=20
> =09__ext_put_deviceids(&tmp);
> @@ -518,7 +519,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_ex=
tent *be, __be32 *p)
> }
>=20
> static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p=
,
> -=09=09size_t buffer_size, size_t *count)
> +=09=09size_t buffer_size, size_t *count, __u64 *lastbyte)
> {
> =09struct pnfs_block_extent *be;
> =09int ret =3D 0;
> @@ -541,6 +542,7 @@ static int ext_tree_encode_commit(struct pnfs_block_l=
ayout *bl, __be32 *p,
> =09=09else
> =09=09=09p =3D encode_block_extent(be, p);
> =09=09be->be_tag =3D EXTENT_COMMITTING;
> +=09=09*lastbyte =3D bl->bl_lwb - 1;
> =09}
> =09spin_unlock(&bl->bl_ext_lock);
>=20
> @@ -564,7 +566,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args=
*arg)
> =09arg->layoutupdate_pages =3D &arg->layoutupdate_page;
>=20
> retry:
> -=09ret =3D ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
> +=09ret =3D ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, =
&arg->lastbytewritten);
> =09if (unlikely(ret)) {
> =09=09ext_tree_free_commitdata(arg, buffer_size);
>=20
That looks good. Thanks!
Trond
Arg, wait, this isn't right; it doesn't handle the case where a mid-file
write follows a write that would extend the file. It needs to check if
bl_lwb < lwb before setting the value, but it also needs a way to clear
it
for every cycle of NFS_INO_LAYOUTCOMMIT.
I'll try again.
Ben
On 22 Aug 2016, at 11:34, Trond Myklebust wrote:
>> On Aug 22, 2016, at 09:48, Benjamin Coddington <[email protected]>
>> wrote:
>>
>> Change from v1:
>>
>> Instead of moving the i_lock protected region around, store last
>> written
>> byte for block layouts on struct pnfs_block_layout and use that when
>> encoding LAYOUTCOMMIT.
>>
>> 8<------------------------------------------------------------------------
>>
>> Block/SCSI layout write completion may add committable extents to the
>> extent tree before updating the layout's last-written byte under the
>> inode
>> lock. If a sync happens before this value is updated, then
>> prepare_layoutcommit may find and encode these extents which would
>> produce
>> a LAYOUTCOMMIT request whose encoded extents are larger than the
>> request's
>> loca_length.
>>
>> Fix this by using a last-written byte value that is updated
>> atomically with
>> the extent tree.
>>
>> Signed-off-by: Benjamin Coddington <[email protected]>
>> ---
>> fs/nfs/blocklayout/blocklayout.c | 2 +-
>> fs/nfs/blocklayout/blocklayout.h | 3 ++-
>> fs/nfs/blocklayout/extent_tree.c | 8 +++++---
>> 3 files changed, 8 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.c
>> b/fs/nfs/blocklayout/blocklayout.c
>> index 17a42e4eb872..25cdd559831c 100644
>> --- a/fs/nfs/blocklayout/blocklayout.c
>> +++ b/fs/nfs/blocklayout/blocklayout.c
>> @@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct
>> *work)
>> PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>>
>> ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
>> - (end - start) >> SECTOR_SHIFT);
>> + (end - start) >> SECTOR_SHIFT, end);
>> }
>>
>> pnfs_ld_write_done(hdr);
>> diff --git a/fs/nfs/blocklayout/blocklayout.h
>> b/fs/nfs/blocklayout/blocklayout.h
>> index 18e6fd0b9506..efc007f00742 100644
>> --- a/fs/nfs/blocklayout/blocklayout.h
>> +++ b/fs/nfs/blocklayout/blocklayout.h
>> @@ -141,6 +141,7 @@ struct pnfs_block_layout {
>> struct rb_root bl_ext_ro;
>> spinlock_t bl_ext_lock; /* Protects list manipulation */
>> bool bl_scsi_layout;
>> + u64 bl_lwb;
>> };
>>
>> static inline struct pnfs_block_layout *
>> @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
>> int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t
>> start,
>> sector_t end);
>> int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t
>> start,
>> - sector_t len);
>> + sector_t len, u64 lwb);
>> bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
>> struct pnfs_block_extent *ret, bool rw);
>> int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
>> diff --git a/fs/nfs/blocklayout/extent_tree.c
>> b/fs/nfs/blocklayout/extent_tree.c
>> index 992bcb19c11e..8d08dfe1e057 100644
>> --- a/fs/nfs/blocklayout/extent_tree.c
>> +++ b/fs/nfs/blocklayout/extent_tree.c
>> @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct
>> pnfs_block_extent *be,
>>
>> int
>> ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
>> - sector_t len)
>> + sector_t len, u64 lwb)
>> {
>> struct rb_root *root = &bl->bl_ext_rw;
>> sector_t end = start + len;
>> @@ -471,6 +471,7 @@ ext_tree_mark_written(struct pnfs_block_layout
>> *bl, sector_t start,
>> }
>> }
>> out:
>> + bl->bl_lwb = lwb;
>> spin_unlock(&bl->bl_ext_lock);
>>
>> __ext_put_deviceids(&tmp);
>> @@ -518,7 +519,7 @@ static __be32 *encode_scsi_range(struct
>> pnfs_block_extent *be, __be32 *p)
>> }
>>
>> static int ext_tree_encode_commit(struct pnfs_block_layout *bl,
>> __be32 *p,
>> - size_t buffer_size, size_t *count)
>> + size_t buffer_size, size_t *count, __u64 *lastbyte)
>> {
>> struct pnfs_block_extent *be;
>> int ret = 0;
>> @@ -541,6 +542,7 @@ static int ext_tree_encode_commit(struct
>> pnfs_block_layout *bl, __be32 *p,
>> else
>> p = encode_block_extent(be, p);
>> be->be_tag = EXTENT_COMMITTING;
>> + *lastbyte = bl->bl_lwb - 1;
>> }
>> spin_unlock(&bl->bl_ext_lock);
>>
>> @@ -564,7 +566,7 @@ ext_tree_prepare_commit(struct
>> nfs4_layoutcommit_args *arg)
>> arg->layoutupdate_pages = &arg->layoutupdate_page;
>>
>> retry:
>> - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
>> + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count,
>> &arg->lastbytewritten);
>> if (unlikely(ret)) {
>> ext_tree_free_commitdata(arg, buffer_size);
>>
>
> That looks good. Thanks!
>
> Trond
Change from v2:
Since writes can arrive out of order, only take lwb values larger than what
was recorded since the last write and clear lwb after encoding.
Change from v1:
Instead of moving the i_lock protected region around, store last written
byte for block layouts on struct pnfs_block_layout and use that when
encoding LAYOUTCOMMIT.
8<------------------------------------------------------------------------
Block/SCSI layout write completion may add committable extents to the
extent tree before updating the layout's last-written byte under the inode
lock. If a sync happens before this value is updated, then
prepare_layoutcommit may find and encode these extents which would produce
a LAYOUTCOMMIT request whose encoded extents are larger than the request's
loca_length.
Fix this by using a last-written byte value that is updated atomically with
the extent tree so that commitable extents always match.
Signed-off-by: Benjamin Coddington <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 2 +-
fs/nfs/blocklayout/blocklayout.h | 3 ++-
fs/nfs/blocklayout/extent_tree.c | 10 +++++++---
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 17a42e4eb872..25cdd559831c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct *work)
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
- (end - start) >> SECTOR_SHIFT);
+ (end - start) >> SECTOR_SHIFT, end);
}
pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
bool bl_scsi_layout;
+ u64 bl_lwb;
};
static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
sector_t end);
int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len);
+ sector_t len, u64 lwb);
bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw);
int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..c85fbfd2d0d9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len)
+ sector_t len, u64 lwb)
{
struct rb_root *root = &bl->bl_ext_rw;
sector_t end = start + len;
@@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
}
out:
+ if (bl->bl_lwb < lwb)
+ bl->bl_lwb = lwb;
spin_unlock(&bl->bl_ext_lock);
__ext_put_deviceids(&tmp);
@@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
}
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
- size_t buffer_size, size_t *count)
+ size_t buffer_size, size_t *count, __u64 *lastbyte)
{
struct pnfs_block_extent *be;
int ret = 0;
@@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
}
+ *lastbyte = bl->bl_lwb - 1;
+ bl->bl_lwb = 0;
spin_unlock(&bl->bl_ext_lock);
return ret;
@@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
arg->layoutupdate_pages = &arg->layoutupdate_page;
retry:
- ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
--
2.5.5
So it looks like this patch go in, but when I did my usually NFS
regression tests that I stopped for a few weeks aover the summer it
broke down badly: all the fsx and some other data integrity tests now
fail in xfstests over pNFS blocklayout.
Given how close it is to 4.8-final can we revert it for now?
On 19 Sep 2016, at 10:58, Christoph Hellwig wrote:
> So it looks like this patch go in, but when I did my usually NFS
> regression tests that I stopped for a few weeks aover the summer it
> broke down badly: all the fsx and some other data integrity tests now
> fail in xfstests over pNFS blocklayout.
>
> Given how close it is to 4.8-final can we revert it for now?
Are you sure this patch is the problem? I just tested and things are
broken for me too, but I think it has something to do with the client
choosing the block layout over SCSI now that the server is returning
multiple layout types.
I don't have block layout configured properly, but still I don't think
I should be getting -EIO if it isn't configured. Shouldn't IO fall back
to the MDS?
It works if the server only returns SCSI layout type.. unless you're testing
something I'm not..
Just wondering if you reverted this patch and saw the problem go away.
Ben
On Mon, Sep 19, 2016 at 03:04:52PM -0400, Benjamin Coddington wrote:
> Are you sure this patch is the problem? I just tested and things are
> broken for me too, but I think it has something to do with the client
> choosing the block layout over SCSI now that the server is returning
> multiple layout types.
Yes, this is my local kvm test setup which only uses the block layout
for now (and doesn't even have the SCSI layout compiled in on the
server).
> Just wondering if you reverted this patch and saw the problem go away.
Yes, it does.
On 19 Sep 2016, at 15:06, Christoph Hellwig wrote:
> On Mon, Sep 19, 2016 at 03:04:52PM -0400, Benjamin Coddington wrote:
>> Are you sure this patch is the problem? I just tested and things are
>> broken for me too, but I think it has something to do with the client
>> choosing the block layout over SCSI now that the server is returning
>> multiple layout types.
>
> Yes, this is my local kvm test setup which only uses the block layout
> for now (and doesn't even have the SCSI layout compiled in on the
> server).
>
>> Just wondering if you reverted this patch and saw the problem go away.
>
> Yes, it does.
Rats.. well, I will investigate.
Ben
On 19 Sep 2016, at 15:40, Benjamin Coddington wrote:
> On 19 Sep 2016, at 15:06, Christoph Hellwig wrote:
>
>> On Mon, Sep 19, 2016 at 03:04:52PM -0400, Benjamin Coddington wrote:
>>> Are you sure this patch is the problem? I just tested and things are
>>> broken for me too, but I think it has something to do with the client
>>> choosing the block layout over SCSI now that the server is returning
>>> multiple layout types.
>>
>> Yes, this is my local kvm test setup which only uses the block layout
>> for now (and doesn't even have the SCSI layout compiled in on the
>> server).
>>
>>> Just wondering if you reverted this patch and saw the problem go away.
>>
>> Yes, it does.
>
> Rats.. well, I will investigate.
> diff --git a/fs/nfs/blocklayout/blocklayout.c
> b/fs/nfs/blocklayout/blocklayout.c
> index 17a42e4eb872..25cdd559831c 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct
> *work)
> PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>
> ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
> - (end - start) >> SECTOR_SHIFT);
> + (end - start) >> SECTOR_SHIFT, end);
> }
I have stupidly made this mistake twice now.. end is not the end of the
write here, but rather the end of the extent. My testing is broken as it's
not catching this, so now going to figure that out.
Ben
8<--------------------------------------------------------------------------------------
Commit 41963c10c47a35185e68cb9049f7a3493c94d2d7 sets the block layout's
last written byte to the offset of the end of the extent rather than the
end of the write which incorrectly updates the inode's size for
partial-page writes.
Fixes: 41963c10c47a ("pnfs/blocklayout: update last_write_offset atomically with extents")
Signed-off-by: Benjamin Coddington <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 217847679f0e..2905479f214a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work)
u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
u64 end = (hdr->args.offset + hdr->args.count +
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
+ u64 lwb = hdr->args.offset + hdr->args.count;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
- (end - start) >> SECTOR_SHIFT, end);
+ (end - start) >> SECTOR_SHIFT, lwb);
}
pnfs_ld_write_done(hdr);
--
2.5.5
This seems to fix the corruption issues.
Reviewed-by: Christoph Hellwig <[email protected]>
Tested-by: Christoph Hellwig <[email protected]>