We want to drop all I/O path locks when recalling layouts, and that includes
i_mutex for the write path. Without this we get stuck processe when recalls
take too long.
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_file.c | 4 ++--
fs/xfs/xfs_ioctl.c | 2 +-
fs/xfs/xfs_iops.c | 2 +-
fs/xfs/xfs_pnfs.c | 7 ++++++-
fs/xfs/xfs_pnfs.h | 2 +-
5 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8..963dfb2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart:
if (error)
return error;
- error = xfs_break_layouts(inode, iolock);
+ error = xfs_break_layouts(inode, iolock, true);
if (error)
return error;
@@ -843,7 +843,7 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, false);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae..3a21cc7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -639,7 +639,7 @@ xfs_ioc_space(
return error;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, false);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a903..79520c7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -975,7 +975,7 @@ xfs_vn_setattr(
uint iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(dentry->d_inode, &iolock);
+ error = xfs_break_layouts(dentry->d_inode, &iolock, true);
if (!error)
error = xfs_setattr_size(ip, iattr);
xfs_iunlock(ip, iolock);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57..981a657 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@
int
xfs_break_layouts(
struct inode *inode,
- uint *iolock)
+ uint *iolock,
+ bool with_imutex)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
+ if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
+ mutex_unlock(&inode->i_mutex);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
+ if (with_imutex)
+ mutex_lock(&inode->i_mutex);
xfs_ilock(ip, *iolock);
}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce..f749475 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,7 +8,7 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr);
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
#else
static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
{
--
1.9.1
On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> We want to drop all I/O path locks when recalling layouts, and that includes
> i_mutex for the write path. Without this we get stuck processe when recalls
> take too long.
Also if the writer is an nfsd thread than we'd rather just error out
than wait.
--b.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/xfs_file.c | 4 ++--
> fs/xfs/xfs_ioctl.c | 2 +-
> fs/xfs/xfs_iops.c | 2 +-
> fs/xfs/xfs_pnfs.c | 7 ++++++-
> fs/xfs/xfs_pnfs.h | 2 +-
> 5 files changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a2e1cb8..963dfb2 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -559,7 +559,7 @@ restart:
> if (error)
> return error;
>
> - error = xfs_break_layouts(inode, iolock);
> + error = xfs_break_layouts(inode, iolock, true);
> if (error)
> return error;
>
> @@ -843,7 +843,7 @@ xfs_file_fallocate(
> return -EOPNOTSUPP;
>
> xfs_ilock(ip, iolock);
> - error = xfs_break_layouts(inode, &iolock);
> + error = xfs_break_layouts(inode, &iolock, false);
> if (error)
> goto out_unlock;
>
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index ac4feae..3a21cc7 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -639,7 +639,7 @@ xfs_ioc_space(
> return error;
>
> xfs_ilock(ip, iolock);
> - error = xfs_break_layouts(inode, &iolock);
> + error = xfs_break_layouts(inode, &iolock, false);
> if (error)
> goto out_unlock;
>
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index e53a903..79520c7 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -975,7 +975,7 @@ xfs_vn_setattr(
> uint iolock = XFS_IOLOCK_EXCL;
>
> xfs_ilock(ip, iolock);
> - error = xfs_break_layouts(dentry->d_inode, &iolock);
> + error = xfs_break_layouts(dentry->d_inode, &iolock, true);
> if (!error)
> error = xfs_setattr_size(ip, iattr);
> xfs_iunlock(ip, iolock);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 365dd57..981a657 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -31,7 +31,8 @@
> int
> xfs_break_layouts(
> struct inode *inode,
> - uint *iolock)
> + uint *iolock,
> + bool with_imutex)
> {
> struct xfs_inode *ip = XFS_I(inode);
> int error;
> @@ -40,8 +41,12 @@ xfs_break_layouts(
>
> while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
> xfs_iunlock(ip, *iolock);
> + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
> + mutex_unlock(&inode->i_mutex);
> error = break_layout(inode, true);
> *iolock = XFS_IOLOCK_EXCL;
> + if (with_imutex)
> + mutex_lock(&inode->i_mutex);
> xfs_ilock(ip, *iolock);
> }
>
> diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
> index b7fbfce..f749475 100644
> --- a/fs/xfs/xfs_pnfs.h
> +++ b/fs/xfs/xfs_pnfs.h
> @@ -8,7 +8,7 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
> int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
> struct iattr *iattr);
>
> -int xfs_break_layouts(struct inode *inode, uint *iolock);
> +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
> #else
> static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
> {
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> We want to drop all I/O path locks when recalling layouts, and that includes
> i_mutex for the write path. Without this we get stuck processe when recalls
> take too long.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
.....
> xfs_iunlock(ip, iolock);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 365dd57..981a657 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -31,7 +31,8 @@
> int
> xfs_break_layouts(
> struct inode *inode,
> - uint *iolock)
> + uint *iolock,
> + bool with_imutex)
> {
> struct xfs_inode *ip = XFS_I(inode);
> int error;
> @@ -40,8 +41,12 @@ xfs_break_layouts(
>
> while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
> xfs_iunlock(ip, *iolock);
> + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
> + mutex_unlock(&inode->i_mutex);
> error = break_layout(inode, true);
> *iolock = XFS_IOLOCK_EXCL;
> + if (with_imutex)
> + mutex_lock(&inode->i_mutex);
> xfs_ilock(ip, *iolock);
> }
That's kinda nasty, and it has no documentation explaining when or
why we'd need to drop the i_mutex. How are we supposed to know if we
need to drop the i_mutex or not? What happens if the upper VFS
layers change or we have a multiple call paths that have different
i_mutex contexts (i.e. one holds, another doesn't)?
Which makes me wonder - is this layout breaking stuff at the right
layer?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Apr 07, 2015 at 05:07:47PM -0400, J. Bruce Fields wrote:
> On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> > We want to drop all I/O path locks when recalling layouts, and that includes
> > i_mutex for the write path. Without this we get stuck processe when recalls
> > take too long.
>
> Also if the writer is an nfsd thread than we'd rather just error out
> than wait.
We have no way to know we are called by nfsd here unfortunately.
On Wed, Apr 08, 2015 at 08:19:27AM +1000, Dave Chinner wrote:
> That's kinda nasty, and it has no documentation explaining when or
> why we'd need to drop the i_mutex. How are we supposed to know if we
> need to drop the i_mutex or not?
We need to drop it if we hold it, pretty easy.
> What happens if the upper VFS
> layers change or we have a multiple call paths that have different
> i_mutex contexts (i.e. one holds, another doesn't)?
We avoid this in the VFS, as everytime we had it filesystems were getting it
wrong.
However you have a point in that we should probably have asserts that the
right locks are held.
> Which makes me wonder - is this layout breaking stuff at the right
> layer?
We can't do it in the VFS as it needs to be atomic vs the lock that
protects write in ->write and ->fallocate, which is only taken in
the filesystem. For ->setattr in theory we could do it in the VFS,
but if the other callers can't do it in the VFS that will just lead
to code duplication.
On Wed, Apr 08, 2015 at 06:21:04PM +0200, Christoph Hellwig wrote:
> On Tue, Apr 07, 2015 at 05:07:47PM -0400, J. Bruce Fields wrote:
> > On Tue, Apr 07, 2015 at 05:35:44PM +0200, Christoph Hellwig wrote:
> > > We want to drop all I/O path locks when recalling layouts, and that includes
> > > i_mutex for the write path. Without this we get stuck processe when recalls
> > > take too long.
> >
> > Also if the writer is an nfsd thread than we'd rather just error out
> > than wait.
(To be clear: ACK to this patch as far as I'm concerned, I've got
another concern but we need this fix regardless.)
> We have no way to know we are called by nfsd here unfortunately.
I was imagining the possible deadlock here as mostly theoretical, but
now that I think of it it doesn't sound unlikely at all:
- file is under heavy write load
- conflicting operation breaks layout
- nfsd threads all block in writes to that file
- no nfsd threads available to service layout return
- recall times out, client fenced.
Ugh.
--b.