2009-03-24 19:31:58

by Peter Staubach

Subject: [PATCH v2] flow control for WRITE requests

--- linux-2.6.28.i586/fs/nfs/inode.c.org
+++ linux-2.6.28.i586/fs/nfs/inode.c
@@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
int err;
+ int inc_outstanding_writes = nfs_max_outstanding_writes;

/*
* Flush out writes to the server in order to update c/mtime.
@@ -497,9 +499,14 @@ int nfs_getattr(struct vfsmount *mnt, st
* nfs_wb_nocommit.
*/
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
- nfs_wb_nocommit(inode);
- mutex_unlock(&inode->i_mutex);
+ if (inc_outstanding_writes) {
+ atomic_add(inc_outstanding_writes, &nfsi->writes);
+ nfs_wb_nocommit(inode);
+ } else {
+ mutex_lock(&inode->i_mutex);
+ nfs_wb_nocommit(inode);
+ mutex_unlock(&inode->i_mutex);
+ }
}

/*
@@ -523,6 +530,10 @@ int nfs_getattr(struct vfsmount *mnt, st
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
+ if (S_ISREG(inode->i_mode) && inc_outstanding_writes) {
+ atomic_sub(inc_outstanding_writes, &nfsi->writes);
+ wake_up(&nfsi->writes_wq);
+ }
return err;
}

@@ -1288,9 +1299,13 @@ static void init_once(void *foo)
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
nfsi->ncommit = 0;
nfsi->npages = 0;
+ atomic_set(&nfsi->ndirty, 0);
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
init_waitqueue_head(&nfsi->waitqueue);
+ atomic_set(&nfsi->writes, 0);
+ init_waitqueue_head(&nfsi->writes_wq);
+ nfsi->wrpos = 0;
nfs4_init_once(nfsi);
}

--- linux-2.6.28.i586/fs/nfs/write.c.org
+++ linux-2.6.28.i586/fs/nfs/write.c
@@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
if (!ret) {
struct inode *inode = page->mapping->host;
struct nfs_server *nfss = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);

+ atomic_dec(&nfsi->ndirty);
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
set_bdi_congested(&nfss->backing_dev_info, WRITE);
@@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
return ret;
}

+int nfs_max_outstanding_writes = 0;
+
+static void nfs_inc_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ atomic_inc(&nfsi->writes);
+}
+
+static void nfs_dec_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (atomic_dec_return(&nfsi->writes) < nfs_max_outstanding_writes)
+ wake_up(&nfsi->writes_wq);
+}
+
+void nfs_wait_for_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (nfs_max_outstanding_writes) {
+ wait_event(nfsi->writes_wq,
+ atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
+ }
+}
+
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
@@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
nfsi->npages++;
+ atomic_inc(&nfsi->ndirty);
kref_get(&req->wb_kref);
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_LOCKED);
@@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ atomic_inc(&nfsi->ndirty);
__set_page_dirty_nobuffers(req->wb_page);
}

@@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
req->wb_bytes = end - req->wb_offset;
else
req->wb_bytes = rqend - req->wb_offset;
+ atomic_inc(&NFS_I(inode)->ndirty);
out_unlock:
spin_unlock(&inode->i_lock);
return req;
@@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
count,
(unsigned long long)data->args.offset);

+ nfs_inc_outstanding_writes(inode);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
*/
status = NFS_PROTO(data->inode)->write_done(task, data);
if (status != 0)
- return status;
+ goto out;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
/* Can't do anything about it except throw an error. */
task->tk_status = -EIO;
}
- return 0;
+out:
+ nfs_dec_outstanding_writes(data->inode);
+ return status;
}


@@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}

+/*
+ * Start the WRITE requests for dirty pages on their way.
+ * This is used when a sufficient number of dirty pages
+ * have accumulated.
+ */
+int nfs_wb_interim(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = {
+ .bdi = mapping->backing_dev_info,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ int ret;
+
+ ret = nfs_writepages(mapping, &wbc);
+ if (ret < 0)
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ return ret;
+}
+
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
--- linux-2.6.28.i586/fs/nfs/sysctl.c.org
+++ linux-2.6.28.i586/fs/nfs/sysctl.c
@@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_max_outstanding_writes",
+ .data = &nfs_max_outstanding_writes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};

--- linux-2.6.28.i586/fs/nfs/file.c.org
+++ linux-2.6.28.i586/fs/nfs/file.c
@@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
return 0;
}

+static int nfs_is_serial(struct inode *inode, loff_t pos)
+{
+ return NFS_I(inode)->wrpos == pos;
+}
+
static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct dentry * dentry = iocb->ki_filp->f_path.dentry;
- struct inode * inode = dentry->d_inode;
+ struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
ssize_t result;
size_t count = iov_length(iov, nr_segs);

@@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
+
+ result = count;
+ if (!count)
+ goto out;
+
+ nfs_wait_for_outstanding_writes(inode);
+
/*
* O_APPEND implies that we must revalidate the file length.
*/
@@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
goto out;
}

- result = count;
- if (!count)
- goto out;
-
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
/* Return error values for O_SYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
- int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
- if (err < 0)
- result = err;
+ if (result >= 0) {
+ if (nfs_need_sync_write(iocb->ki_filp, inode)) {
+ int err;
+ err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
+ inode);
+ if (err < 0)
+ result = err;
+ } else if (nfs_max_outstanding_writes &&
+ nfs_is_serial(inode, pos) &&
+ atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
+ nfs_wb_interim(inode);
+ if (result > 0)
+ nfsi->wrpos = pos + result;
}
out:
return result;
--- linux-2.6.28.i586/include/linux/nfs_fs.h.org
+++ linux-2.6.28.i586/include/linux/nfs_fs.h
@@ -168,6 +168,7 @@ struct nfs_inode {

unsigned long ncommit,
npages;
+ atomic_t ndirty;

/* Open contexts for shared mmap writes */
struct list_head open_files;
@@ -186,6 +187,9 @@ struct nfs_inode {
fmode_t delegation_state;
struct rw_semaphore rwsem;
#endif /* CONFIG_NFS_V4*/
+ atomic_t writes; /* number of outstanding WRITEs */
+ wait_queue_head_t writes_wq;
+ loff_t wrpos; /* position after last WRITE */
struct inode vfs_inode;
};

@@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
* linux/fs/nfs/write.c
*/
extern int nfs_congestion_kb;
+extern int nfs_max_outstanding_writes;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
extern void nfs_writedata_release(void *);
+extern void nfs_wait_for_outstanding_writes(struct inode *);

/*
* Try to write back everything synchronously (but check the
@@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
extern int nfs_wb_nocommit(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page* page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
+extern int nfs_wb_interim(struct inode *);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_write_data *nfs_commitdata_alloc(void);


Attachments:
flow_control.devel.2 (9.44 kB)
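
For reference, a minimal sketch of how the proposed tunable might be
switched on from user space. The patch registers the sysctl in
fs/nfs/sysctl.c, so it should show up under /proc/sys/fs/nfs/ next to
the existing NFS client tunables; the exact path is an assumption, and
the value of 16 is only the suggestion from the patch description
quoted in the replies below.

/* Hypothetical helper: enable the proposed per-file flow control. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/fs/nfs/nfs_max_outstanding_writes";
	const char *value = argc > 1 ? argv[1] : "16";
	FILE *f = fopen(path, "w");

	if (f == NULL) {
		perror(path);
		return EXIT_FAILURE;
	}
	if (fprintf(f, "%s\n", value) < 0 || fclose(f) != 0) {
		perror(path);
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}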

2009-03-24 21:19:22

by J. Bruce Fields

Subject: Re: [PATCH v2] flow control for WRITE requests

On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
> Hi.
>
> Attached is a patch which implements some flow control for the
> NFS client to control dirty pages. The flow control is
> implemented on a per-file basis and causes dirty pages to be
> written out when the client can detect that the application is
> writing in a serial fashion and has dirtied enough pages to
> fill a complete over the wire transfer.
>
> This work was precipitated by working on a situation where a
> server at a customer site was not able to adequately handle
> the behavior of the Linux NFS client. This particular server
> required that all data written to the file be
> written in a strictly serial fashion. It also had problems
> handling the Linux NFS client semantic of caching a large
> amount of data and then sending out that data all at once.
>
> The sequential ordering problem was resolved by a previous
> patch which was submitted to the linux-nfs list. This patch
> addresses the capacity problem.
>
> The problem is resolved by sending WRITE requests much
> earlier in the process of the application writing to the file.
> The client keeps track of the number of dirty pages associated
> with the file and also the last offset of the data being
> written. When the client detects that a full over the wire
> transfer could be constructed and that the application is
> writing sequentially, then it generates an UNSTABLE write to
> server for the currently dirty data.
>
> The client also keeps track of the number of these WRITE
> requests which have been generated. It flow controls based
> on a configurable maximum. This keeps the client from
> completely overwhelming the server.
>
> A nice side effect of the framework is that the issue of
> stat()'ing a file being written can be handled much more
> quickly than before. The amount of data that must be
> transmitted to the server to satisfy the "latest mtime"
> requirement is limited. Also, the application writing to
> the file is blocked until the over the wire GETATTR is
> completed. This allows the GETATTR to be sent and the
> response received without competing with the data being
> written.
>
> No performance regressions were seen during informal
> performance testing.
>
> As a side note -- the more natural model of flow control
> would seem to be at the client/server level instead of
> the per-file level. However, that level was too coarse
> with the particular server that was required to be used
> because its requirements were at the per-file level.

I don't understand what you mean by "its requirements were at the
per-file level".

> The new functionality in this patch is controlled via the
> use of the sysctl, nfs_max_outstanding_writes. It defaults
> to 0, meaning no flow control and the current behaviors.
> Setting it to any non-zero value enables the functionality.
> The value of 16 seems to be a good number and aligns with
> other NFS and RPC tunables.
>
> Lastly, the functionality of starting WRITE requests sooner
> to smooth out the i/o pattern should probably be done by the
> VM subsystem. I am looking into this, but in the meantime
> and to solve the immediate problem, this support is proposed.

It seems unfortunate if we add a sysctl to work around a problem that
ends up being fixed some other way a version or two later.

Would be great to have some progress on these problems, though....

--b.

2009-03-25 13:15:57

by Peter Staubach

Subject: Re: [PATCH v2] flow control for WRITE requests

J. Bruce Fields wrote:
> On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
>
>> Hi.
>>
>> Attached is a patch which implements some flow control for the
>> NFS client to control dirty pages. The flow control is
>> implemented on a per-file basis and causes dirty pages to be
>> written out when the client can detect that the application is
>> writing in a serial fashion and has dirtied enough pages to
>> fill a complete over the wire transfer.
>>
>> This work was precipitated by working on a situation where a
>> server at a customer site was not able to adequately handle
>> the behavior of the Linux NFS client. This particular server
>> required that all data written to the file be
>> written in a strictly serial fashion. It also had problems
>> handling the Linux NFS client semantic of caching a large
>> amount of data and then sending out that data all at once.
>>
>> The sequential ordering problem was resolved by a previous
>> patch which was submitted to the linux-nfs list. This patch
>> addresses the capacity problem.
>>
>> The problem is resolved by sending WRITE requests much
>> earlier in the process of the application writing to the file.
>> The client keeps track of the number of dirty pages associated
>> with the file and also the last offset of the data being
>> written. When the client detects that a full over the wire
>> transfer could be constructed and that the application is
>> writing sequentially, then it generates an UNSTABLE write to
>> server for the currently dirty data.
>>
>> The client also keeps track of the number of these WRITE
>> requests which have been generated. It flow controls based
>> on a configurable maximum. This keeps the client from
>> completely overwhelming the server.
>>
>> A nice side effect of the framework is that the issue of
>> stat()'ing a file being written can be handled much more
>> quickly than before. The amount of data that must be
>> transmitted to the server to satisfy the "latest mtime"
>> requirement is limited. Also, the application writing to
>> the file is blocked until the over the wire GETATTR is
>> completed. This allows the GETATTR to be sent and the
>> response received without competing with the data being
>> written.
>>
>> No performance regressions were seen during informal
>> performance testing.
>>
>> As a side note -- the more natural model of flow control
>> would seem to be at the client/server level instead of
>> the per-file level. However, that level was too coarse
>> with the particular server that was required to be used
>> because its requirements were at the per-file level.
>>
>
> I don't understand what you mean by "its requirements were at the
> per-file level".
>
>

This particular server had a cache for WRITE requests on a
per-file basis. It could only write data to the underlying
file on a strictly sequential basis because the underlying
file was a variable-length, record-based file. Out-of-order
data coming from the client was cached until the correct
data had been received.

>> The new functionality in this patch is controlled via the
>> use of the sysctl, nfs_max_outstanding_writes. It defaults
>> to 0, meaning no flow control and the current behaviors.
>> Setting it to any non-zero value enables the functionality.
>> The value of 16 seems to be a good number and aligns with
>> other NFS and RPC tunables.
>>
>> Lastly, the functionality of starting WRITE requests sooner
>> to smooth out the i/o pattern should probably be done by the
>> VM subsystem. I am looking into this, but in the meantime
>> and to solve the immediate problem, this support is proposed.
>>
>
> It seems unfortunate if we add a sysctl to work around a problem that
> ends up being fixed some other way a version or two later.
>
> Would be great to have some progress on these problems, though....

I agree with this. We need something and waiting for the final,
perfect solution won't help the situation that I was asked to
look at.

I suspect that we will need some way to control the number of
outstanding WRITE requests, no matter what the underlying
mechanism ends up being. This will need to be at least at
per-file-system granularity. System-wide will be too
coarse.

Thanx...

ps


2009-05-27 19:18:44

by Peter Staubach

Subject: Re: [PATCH v2] flow control for WRITE requests

J. Bruce Fields wrote:
> On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
>
>> Hi.
>>
>> Attached is a patch which implements some flow control for the
>> NFS client to control dirty pages. The flow control is
>> implemented on a per-file basis and causes dirty pages to be
>> written out when the client can detect that the application is
>> writing in a serial fashion and has dirtied enough pages to
>> fill a complete over the wire transfer.
>>
>> This work was precipitated by working on a situation where a
>> server at a customer site was not able to adequately handle
>> the behavior of the Linux NFS client. This particular server
>> required that all data written to the file be
>> written in a strictly serial fashion. It also had problems
>> handling the Linux NFS client semantic of caching a large
>> amount of data and then sending out that data all at once.
>>
>> The sequential ordering problem was resolved by a previous
>> patch which was submitted to the linux-nfs list. This patch
>> addresses the capacity problem.
>>
>> The problem is resolved by sending WRITE requests much
>> earlier in the process of the application writing to the file.
>> The client keeps track of the number of dirty pages associated
>> with the file and also the last offset of the data being
>> written. When the client detects that a full over the wire
>> transfer could be constructed and that the application is
>> writing sequentially, then it generates an UNSTABLE write to
>> server for the currently dirty data.
>>
>> The client also keeps track of the number of these WRITE
>> requests which have been generated. It flow controls based
>> on a configurable maximum. This keeps the client from
>> completely overwhelming the server.
>>
>> A nice side effect of the framework is that the issue of
>> stat()'ing a file being written can be handled much more
>> quickly than before. The amount of data that must be
>> transmitted to the server to satisfy the "latest mtime"
>> requirement is limited. Also, the application writing to
>> the file is blocked until the over the wire GETATTR is
>> completed. This allows the GETATTR to be sent and the
>> response received without competing with the data being
>> written.
>>
>> No performance regressions were seen during informal
>> performance testing.
>>
>> As a side note -- the more natural model of flow control
>> would seem to be at the client/server level instead of
>> the per-file level. However, that level was too coarse
>> with the particular server that was required to be used
>> because its requirements were at the per-file level.
>>
>
> I don't understand what you mean by "its requirements were at the
> per-file level".
>
>
>> The new functionality in this patch is controlled via the
>> use of the sysctl, nfs_max_outstanding_writes. It defaults
>> to 0, meaning no flow control and the current behaviors.
>> Setting it to any non-zero value enables the functionality.
>> The value of 16 seems to be a good number and aligns with
>> other NFS and RPC tunables.
>>
>> Lastly, the functionality of starting WRITE requests sooner
>> to smooth out the i/o pattern should probably be done by the
>> VM subsystem. I am looking into this, but in the meantime
>> and to solve the immediate problem, this support is proposed.
>>
>
> It seems unfortunate if we add a sysctl to work around a problem that
> ends up being fixed some other way a version or two later.
>
> Would be great to have some progress on these problems, though....
>
> --b.
>

Hi.

I have attached a new testcase which exhibits this particular
situation. One script writes out 6 ~1GB files in parallel,
while the other script is simultaneously running an "ls -l"
in the directory.
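
The attached scripts are not reproduced here; the following is a rough,
self-contained C analogue of what they are described as doing (several
parallel sequential writers plus a process repeatedly timing stat()
against the same files). File names, sizes and iteration counts are
purely illustrative; run it in a directory on the NFS mount under test.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define NFILES		6
#define FILE_SIZE	(1024LL * 1024 * 1024)	/* ~1GB per file */
#define CHUNK		(64 * 1024)

static void writer(const char *name)
{
	static char buf[CHUNK];
	long long done = 0;
	int fd = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror(name);
		_exit(1);
	}
	while (done < FILE_SIZE) {
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
			perror("write");
			_exit(1);
		}
		done += sizeof(buf);
	}
	close(fd);	/* close-to-open: this is where the flush happens */
	_exit(0);
}

int main(void)
{
	char name[32];
	int i, round;

	for (i = 0; i < NFILES; i++) {
		snprintf(name, sizeof(name), "testfile.%d", i);
		if (fork() == 0)
			writer(name);
	}

	/* The "ls -l" side: watch how long each stat() takes. */
	for (round = 0; round < 60; round++) {
		for (i = 0; i < NFILES; i++) {
			struct stat st;
			struct timespec t0, t1;

			snprintf(name, sizeof(name), "testfile.%d", i);
			clock_gettime(CLOCK_MONOTONIC, &t0);
			if (stat(name, &st) < 0)
				continue;
			clock_gettime(CLOCK_MONOTONIC, &t1);
			printf("stat(%s): %.3f s, size %lld\n", name,
			       (t1.tv_sec - t0.tv_sec) +
			       (t1.tv_nsec - t0.tv_nsec) / 1e9,
			       (long long)st.st_size);
		}
		sleep(1);
	}
	while (wait(NULL) > 0)
		;
	return 0;
}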

When run on a system large enough to store all ~6GB of data,
the dd processes basically write(2) all of their data into
memory very quickly and then spend most of their time in the
close(2) system call flushing the page cache due to the close
to open processing.

The current flow control support in the NFS client does not work
well for this situation. It was designed to catch the process
filling memory and to block it while the page cache flush is
being done by the process doing the stat(2).

The problem with this approach is that there could potentially be
gigabytes of page cache which needs to be flushed to the server
during the stat(2) processing. This blocks the application
doing the stat(2) for potentially a very long time, based on the
amount of data which was cached, the speed of the network, and
the speed of the server.
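
As a rough worked example with the numbers from this thread: ~6GB of
cached dirty data pushed to a server at the ~38MB/s observed below
takes on the order of 6144MB / 38MB/s, i.e. roughly 160 seconds, all
of it spent inside a single stat(2) call.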

The solution is to limit the amount of data that must be flushed
during the stat(2) call. This can be done by starting i/o when
the application has filled enough pages to fill an entire wsize'd
transfer and by limiting the number of these transfers which are
outstanding so as not to overwhelm the server.
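
A minimal user-space sketch of that trigger, assuming 4KB pages and a
32KB wsize purely for illustration; the names below are stand-ins, not
the kernel symbols, and the real patch tracks dirty pages in the
nfs_inode rather than resetting a local counter:

#include <stdio.h>

#define PAGE_SIZE	4096
#define WSIZE		32768		/* one over-the-wire WRITE = 8 pages */

struct wr_state {
	long long wrpos;	/* offset just past the last write(2) */
	long ndirty;		/* dirty pages not yet sent to the server */
};

/* Called per write(2); returns 1 when an unstable flush should start. */
static int should_flush(struct wr_state *s, long long pos, long len)
{
	int serial = (s->wrpos == pos);		/* the nfs_is_serial() test */

	s->ndirty += (len + PAGE_SIZE - 1) / PAGE_SIZE;
	s->wrpos = pos + len;

	if (serial && s->ndirty >= WSIZE / PAGE_SIZE) {
		s->ndirty = 0;		/* handed off as one UNSTABLE WRITE */
		return 1;
	}
	return 0;			/* random writers use normal writeback */
}

int main(void)
{
	struct wr_state s = { 0, 0 };
	long long pos;
	int flushes = 0;

	for (pos = 0; pos < 1024 * 1024; pos += PAGE_SIZE)
		flushes += should_flush(&s, pos, PAGE_SIZE);
	printf("1MB written sequentially -> %d early flushes\n", flushes);
	return 0;
}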

-----------

While it seems that it would be good to have this done by the
VM itself, the current architecture of the VM does not seem to
yield itself easily to doing this. It seems like doing something
like a per-file bdi would do the trick, however the system is
not scalable to the number of bdi's that that would require.

I am open to suggestions for alternate solutions, but in the
meantime, this support does seem to address the situation. In
my test environment, it also increases, significantly,
performance when sequentially writing large files. My throughput
when dd'ing /dev/sda1 to an NFS mounted file went from ~22MB/s
to ~38MB/s. (I do this for image backups for my laptop.) Your
mileage may vary however. :-)

So, can we consider taking this so that we can address some
customer needs?

Thanx...

ps


Attachments:
master.sh (133.00 B)
reader.sh (89.00 B)

2009-05-27 20:45:54

by Trond Myklebust

Subject: Re: [PATCH v2] flow control for WRITE requests

On Wed, 2009-05-27 at 15:18 -0400, Peter Staubach wrote:
> J. Bruce Fields wrote:
> > On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
> >
> >> Hi.
> >>
> >> Attached is a patch which implements some flow control for the
> >> NFS client to control dirty pages. The flow control is
> >> implemented on a per-file basis and causes dirty pages to be
> >> written out when the client can detect that the application is
> >> writing in a serial fashion and has dirtied enough pages to
> >> fill a complete over the wire transfer.
> >>
> >> This work was precipitated by working on a situation where a
> >> server at a customer site was not able to adequately handle
> >> the behavior of the Linux NFS client. This particular server
> >> required that all data written to the file be
> >> written in a strictly serial fashion. It also had problems
> >> handling the Linux NFS client semantic of caching a large
> >> amount of data and then sending out that data all at once.
> >>
> >> The sequential ordering problem was resolved by a previous
> >> patch which was submitted to the linux-nfs list. This patch
> >> addresses the capacity problem.
> >>
> >> The problem is resolved by sending WRITE requests much
> >> earlier in the process of the application writing to the file.
> >> The client keeps track of the number of dirty pages associated
> >> with the file and also the last offset of the data being
> >> written. When the client detects that a full over the wire
> >> transfer could be constructed and that the application is
> >> writing sequentially, then it generates an UNSTABLE write to
> >> server for the currently dirty data.
> >>
> >> The client also keeps track of the number of these WRITE
> >> requests which have been generated. It flow controls based
> >> on a configurable maximum. This keeps the client from
> >> completely overwhelming the server.
> >>
> >> A nice side effect of the framework is that the issue of
> >> stat()'ing a file being written can be handled much more
> >> quickly than before. The amount of data that must be
> >> transmitted to the server to satisfy the "latest mtime"
> >> requirement is limited. Also, the application writing to
> >> the file is blocked until the over the wire GETATTR is
> >> completed. This allows the GETATTR to be sent and the
> >> response received without competing with the data being
> >> written.
> >>
> >> No performance regressions were seen during informal
> >> performance testing.
> >>
> >> As a side note -- the more natural model of flow control
> >> would seem to be at the client/server level instead of
> >> the per-file level. However, that level was too coarse
> >> with the particular server that was required to be used
> >> because its requirements were at the per-file level.
> >>
> >
> > I don't understand what you mean by "its requirements were at the
> > per-file level".
> >
> >
> >> The new functionality in this patch is controlled via the
> >> use of the sysctl, nfs_max_outstanding_writes. It defaults
> >> to 0, meaning no flow control and the current behaviors.
> >> Setting it to any non-zero value enables the functionality.
> >> The value of 16 seems to be a good number and aligns with
> >> other NFS and RPC tunables.
> >>
> >> Lastly, the functionality of starting WRITE requests sooner
> >> to smooth out the i/o pattern should probably be done by the
> >> VM subsystem. I am looking into this, but in the meantime
> >> and to solve the immediate problem, this support is proposed.
> >>
> >
> > It seems unfortunate if we add a sysctl to work around a problem that
> > ends up being fixed some other way a version or two later.
> >
> > Would be great to have some progress on these problems, though....
> >
> > --b.
> >
>
> Hi.
>
> I have attached a new testcase which exhibits this particular
> situation. One script writes out 6 ~1GB files in parallel,
> while the other script is simultaneously running an "ls -l"
> in the directory.
>
> When run on a system large enough to store all ~6GB of data,
> the dd processes basically write(2) all of their data into
> memory very quickly and then spend most of their time in the
> close(2) system call flushing the page cache due to the close
> to open processing.
>
> The current flow control support in the NFS client does not work
> well for this situation. It was designed to catch the process
> filling memory and to block it while the page cache flush is
> being done by the process doing the stat(2).
>
> The problem with this approach is that there could potentially be
> gigabytes of page cache which needs to be flushed to the server
> during the stat(2) processing. This blocks the application
> doing the stat(2) for potentially a very long time, based on the
> amount of data which was cached, the speed of the network, and
> the speed of the server.
>
> The solution is to limit the amount of data that must be flushed
> during the stat(2) call. This can be done by starting i/o when
> the application has filled enough pages to fill an entire wsize'd
> transfer and by limiting the number of these transfers which are
> outstanding so as not to overwhelm the server.
>
> -----------
>
> While it seems that it would be good to have this done by the
> VM itself, the current architecture of the VM does not seem to
> yield itself easily to doing this. It seems like doing something
> like a per-file bdi would do the trick, however the system is
> not scalable to the number of bdi's that that would require.
>
> I am open to suggestions for alternate solutions, but in the
> meantime, this support does seem to address the situation. In
> my test environment, it also increases, significantly,
> performance when sequentially writing large files. My throughput
> when dd'ing /dev/sda1 to an NFS mounted file went from ~22MB/s
> to ~38MB/s. (I do this for image backups for my laptop.) Your
> mileage may vary however. :-)
>
> So, can we consider taking this so that we can address some
> customer needs?


In the above mail, you are justifying the patch out of concern for
stat() behaviour, but (unless I'm looking at an outdated version) that
is clearly not what has driven the design.
For instance, the call to nfs_wait_for_outstanding_writes() seems to be
unnecessary to fix the issue of flow control in stat() to which you
refer above, and is likely to be detrimental to write() performance.
Also, you have the nfs_is_serial() heuristic, which turns it all off in
the random writeback case. Again, that seems to have little to do with
fixing stat().
I realise that your main motivation is to address the needs of the
customer in question, but I'm still not convinced that this is the right
way to do it.

To address the actual issue of WRITE request reordering, do we know why
the NFS client is generating out of order RPCs? Is it just reordering
within the RPC layer, or is it something else? For instance, I seem to
recollect that Chris Mason mentioned WB_SYNC_NONE, as being a major
source of non-linearity when he looked at btrfs. I can imagine that when
you combine that with the use of the 'range_cyclic' flag in
writeback_control, then you will get all sorts of "interesting" request
orders...

Cheers,
Trond


2009-05-28 15:41:19

by Peter Staubach

Subject: Re: [PATCH v2] flow control for WRITE requests

Trond Myklebust wrote:
> On Wed, 2009-05-27 at 15:18 -0400, Peter Staubach wrote:
>
>> J. Bruce Fields wrote:
>>
>>> On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
>>>
>>>
>>>> Hi.
>>>>
>>>> Attached is a patch which implements some flow control for the
>>>> NFS client to control dirty pages. The flow control is
>>>> implemented on a per-file basis and causes dirty pages to be
>>>> written out when the client can detect that the application is
>>>> writing in a serial fashion and has dirtied enough pages to
>>>> fill a complete over the wire transfer.
>>>>
>>>> This work was precipitated by working on a situation where a
>>>> server at a customer site was not able to adequately handle
>>>> the behavior of the Linux NFS client. This particular server
>>>> required that all data written to the file be
>>>> written in a strictly serial fashion. It also had problems
>>>> handling the Linux NFS client semantic of caching a large
>>>> amount of data and then sending out that data all at once.
>>>>
>>>> The sequential ordering problem was resolved by a previous
>>>> patch which was submitted to the linux-nfs list. This patch
>>>> addresses the capacity problem.
>>>>
>>>> The problem is resolved by sending WRITE requests much
>>>> earlier in the process of the application writing to the file.
>>>> The client keeps track of the number of dirty pages associated
>>>> with the file and also the last offset of the data being
>>>> written. When the client detects that a full over the wire
>>>> transfer could be constructed and that the application is
>>>> writing sequentially, then it generates an UNSTABLE write to
>>>> server for the currently dirty data.
>>>>
>>>> The client also keeps track of the number of these WRITE
>>>> requests which have been generated. It flow controls based
>>>> on a configurable maximum. This keeps the client from
>>>> completely overwhelming the server.
>>>>
>>>> A nice side effect of the framework is that the issue of
>>>> stat()'ing a file being written can be handled much more
>>>> quickly than before. The amount of data that must be
>>>> transmitted to the server to satisfy the "latest mtime"
>>>> requirement is limited. Also, the application writing to
>>>> the file is blocked until the over the wire GETATTR is
>>>> completed. This allows the GETATTR to be sent and the
>>>> response received without competing with the data being
>>>> written.
>>>>
>>>> No performance regressions were seen during informal
>>>> performance testing.
>>>>
>>>> As a side note -- the more natural model of flow control
>>>> would seem to be at the client/server level instead of
>>>> the per-file level. However, that level was too coarse
>>>> with the particular server that was required to be used
>>>> because its requirements were at the per-file level.
>>>>
>>>>
>>> I don't understand what you mean by "its requirements were at the
>>> per-file level".
>>>
>>>
>>>
>>>> The new functionality in this patch is controlled via the
>>>> use of the sysctl, nfs_max_outstanding_writes. It defaults
>>>> to 0, meaning no flow control and the current behaviors.
>>>> Setting it to any non-zero value enables the functionality.
>>>> The value of 16 seems to be a good number and aligns with
>>>> other NFS and RPC tunables.
>>>>
>>>> Lastly, the functionality of starting WRITE requests sooner
>>>> to smooth out the i/o pattern should probably be done by the
>>>> VM subsystem. I am looking into this, but in the meantime
>>>> and to solve the immediate problem, this support is proposed.
>>>>
>>>>
>>> It seems unfortunate if we add a sysctl to work around a problem that
>>> ends up being fixed some other way a version or two later.
>>>
>>> Would be great to have some progress on these problems, though....
>>>
>>> --b.
>>>
>>>
>> Hi.
>>
>> I have attached a new testcase which exhibits this particular
>> situation. One script writes out 6 ~1GB files in parallel,
>> while the other script is simultaneously running an "ls -l"
>> in the directory.
>>
>> When run on a system large enough to store all ~6GB of data,
>> the dd processes basically write(2) all of their data into
>> memory very quickly and then spend most of their time in the
>> close(2) system call flushing the page cache due to the close
>> to open processing.
>>
>> The current flow control support in the NFS client does not work
>> well for this situation. It was designed to catch the process
>> filling memory and to block it while the page cache flush is
>> being done by the process doing the stat(2).
>>
>> The problem with this approach is that there could potentially be
>> gigabytes of page cache which needs to be flushed to the server
>> during the stat(2) processing. This blocks the application
>> doing the stat(2) for potentially a very long time, based on the
>> amount of data which was cached, the speed of the network, and
>> the speed of the server.
>>
>> The solution is to limit the amount of data that must be flushed
>> during the stat(2) call. This can be done by starting i/o when
>> the application has filled enough pages to fill an entire wsize'd
>> transfer and by limiting the number of these transfers which are
>> outstanding so as not to overwhelm the server.
>>
>> -----------
>>
>> While it seems that it would be good to have this done by the
>> VM itself, the current architecture of the VM does not seem to
>> yield itself easily to doing this. It seems like doing something
>> like a per-file bdi would do the trick, however the system is
>> not scalable to the number of bdi's that that would require.
>>
>> I am open to suggestions for alternate solutions, but in the
>> meantime, this support does seem to address the situation. In
>> my test environment, it also increases, significantly,
>> performance when sequentially writing large files. My throughput
>> when dd'ing /dev/sda1 to an NFS mounted file went from ~22MB/s
>> to ~38MB/s. (I do this for image backups for my laptop.) Your
>> mileage may vary however. :-)
>>
>> So, can we consider taking this so that we can address some
>> customer needs?
>>
>
>
> In the above mail, you are justifying the patch out of concern for
> stat() behaviour, but (unless I'm looking at an outdated version) that
> is clearly not what has driven the design.
> For instance, the call to nfs_wait_for_outstanding_writes() seems to be
> unnecessary to fix the issue of flow control in stat() to which you
> refer above, and is likely to be detrimental to write() performance.
> Also, you have the nfs_is_serial() heuristic, which turns it all off in
> the random writeback case. Again, that seems to have little to do with
> fixing stat().
> I realise that your main motivation is to address the needs of the
> customer in question, but I'm still not convinced that this is the right
> way to do it.
>
>

Actually, I was able to solve the stat() problem as a side
effect of the original design, but it seemed like additional
reasons for wanting this code integrated.

Yes, part of the architecture is to smooth the WRITE traffic
and to keep from overwhelming the server. This is what the
nfs_wait_for_outstanding_writes() does.
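
In outline this is the usual bounded-in-flight pattern. A user-space
analogue, with a mutex and condition variable standing in for the
kernel's atomic counter and wait queue, might look like the sketch
below; the function names only loosely mirror the patch, and
flush_dirty_pages()/send_getattr() are stand-ins rather than real
interfaces:

#include <pthread.h>
#include <stdio.h>

#define MAX_OUTSTANDING_WRITES	16	/* the suggested sysctl value */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t writes_wq = PTHREAD_COND_INITIALIZER;
static int writes;			/* WRITEs currently in flight */

/* write(2) path: block while the server already has a full pipeline. */
static void wait_for_outstanding_writes(void)
{
	pthread_mutex_lock(&lock);
	while (writes >= MAX_OUTSTANDING_WRITES)
		pthread_cond_wait(&writes_wq, &lock);
	pthread_mutex_unlock(&lock);
}

/* RPC setup and completion, like nfs_inc/dec_outstanding_writes(). */
static void inc_outstanding_writes(void)
{
	pthread_mutex_lock(&lock);
	writes++;
	pthread_mutex_unlock(&lock);
}

static void dec_outstanding_writes(void)
{
	pthread_mutex_lock(&lock);
	if (--writes < MAX_OUTSTANDING_WRITES)
		pthread_cond_broadcast(&writes_wq);
	pthread_mutex_unlock(&lock);
}

/*
 * stat(2) path: add the whole limit up front so new writers block,
 * flush what is already dirty, do the GETATTR, then release the writers.
 */
static void getattr_side(void (*flush_dirty_pages)(void),
			 void (*send_getattr)(void))
{
	pthread_mutex_lock(&lock);
	writes += MAX_OUTSTANDING_WRITES;
	pthread_mutex_unlock(&lock);

	flush_dirty_pages();
	send_getattr();

	pthread_mutex_lock(&lock);
	writes -= MAX_OUTSTANDING_WRITES;
	pthread_cond_broadcast(&writes_wq);
	pthread_mutex_unlock(&lock);
}

static void noop(void)
{
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++) {
		wait_for_outstanding_writes();
		inc_outstanding_writes();
		dec_outstanding_writes();
	}
	getattr_side(noop, noop);
	printf("writes in flight at exit: %d\n", writes);
	return 0;
}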

I could update the changelog to mention that this support is
disabled if the NFS client detects random access to the file.
I added that so that applications such as databases wouldn't
be harmed. I guess that I just took that sort of thing for
granted and didn't think about it much further.

> To address the actual issue of WRITE request reordering, do we know why
> the NFS client is generating out of order RPCs? Is it just reordering
> within the RPC layer, or is it something else? For instance, I seem to
> recollect that Chris Mason mentioned WB_SYNC_NONE, as being a major
> source of non-linearity when he looked at btrfs. I can imagine that when
> you combine that with the use of the 'range_cyclic' flag in
> writeback_control, then you will get all sorts of "interesting" request
> orders...

This version of the support does not address WRITE request
reordering. The other changes to the system plus the NFS_INO_FLUSHING
support that you added seem to address this inasmuch as I
don't see out of order WRITE requests anymore.

-----

I am trying to accomplish two things here. The first thing
was to smooth the WRITE traffic so that the client would perform
better. Caching a few gigabytes of data and then flushing it to
the server using a firehose doesn't seem to work very well. In
a customer situation, I really had a server which could not keep
up with the client. Something was needed to better match the
client and server bandwidths.

Second, I noticed that the architecture to smooth the WRITE
traffic and do the flow control could be used very nicely to
solve the stat() problem too. The smoothing of the WRITE
traffic results in fewer dirty cached pages which need to get
flushed to the server during the stat() processing. This helps
to reduce the latency of the stat() call. Next, the flow control
aspect can be used to block the application which is writing to
the file while the GETATTR is outstanding. It happens without adding any
more code to the writing path.

I have spent quite a bit of time trying to measure the performance
impact. As far as I can see, it varies from significantly better
to no effect. Some things like dd run much better in my test
network. Other things like rpmbuild don't appear to be affected.
Compilations tend to be random access to files and are generally
more cpu limited than i/o bound.

-----

I'd be happy to chat about any other ideas for ways to solve
the issues that I need to solve. At the moment, there is a
customer who is quite interested in getting the stat() problem
resolved. (He may come to your attention from another
direction as well.) We've given him a workaround, which may
end up being his solution, but that workaround won't work for
all of the rest of the people who have complained about the
stat() problem. Adding the locking for i_mutex around the
page flushing did help to ensure that the stat() processing
eventually succeeds, but left some problems with very large
latencies while waiting for large page caches to flush. Even
on my little 4GB system, those latencies can be 10s of seconds
or more. This is not generally acceptable by our users.

Thanx...

ps

2009-05-28 15:49:51

by Chuck Lever

Subject: Re: [PATCH v2] flow control for WRITE requests

On May 28, 2009, at 11:41 AM, Peter Staubach wrote:
> Trond Myklebust wrote:
>> On Wed, 2009-05-27 at 15:18 -0400, Peter Staubach wrote:
>>
>>> J. Bruce Fields wrote:
>>>
>>>> On Tue, Mar 24, 2009 at 03:31:50PM -0400, Peter Staubach wrote:
>>>>
>>>>
>>>>> Hi.
>>>>>
>>>>> Attached is a patch which implements some flow control for the
>>>>> NFS client to control dirty pages. The flow control is
>>>>> implemented on a per-file basis and causes dirty pages to be
>>>>> written out when the client can detect that the application is
>>>>> writing in a serial fashion and has dirtied enough pages to
>>>>> fill a complete over the wire transfer.
>>>>>
>>>>> This work was precipitated by working on a situation where a
>>>>> server at a customer site was not able to adequately handle
>>>>> the behavior of the Linux NFS client. This particular server
>>>>> required that all data written to the file be
>>>>> written in a strictly serial fashion. It also had problems
>>>>> handling the Linux NFS client semantic of caching a large
>>>>> amount of data and then sending out that data all at once.
>>>>>
>>>>> The sequential ordering problem was resolved by a previous
>>>>> patch which was submitted to the linux-nfs list. This patch
>>>>> addresses the capacity problem.
>>>>>
>>>>> The problem is resolved by sending WRITE requests much
>>>>> earlier in the process of the application writing to the file.
>>>>> The client keeps track of the number of dirty pages associated
>>>>> with the file and also the last offset of the data being
>>>>> written. When the client detects that a full over the wire
>>>>> transfer could be constructed and that the application is
>>>>> writing sequentially, then it generates an UNSTABLE write to
>>>>> server for the currently dirty data.
>>>>>
>>>>> The client also keeps track of the number of these WRITE
>>>>> requests which have been generated. It flow controls based
>>>>> on a configurable maximum. This keeps the client from
>>>>> completely overwhelming the server.
>>>>>
>>>>> A nice side effect of the framework is that the issue of
>>>>> stat()'ing a file being written can be handled much more
>>>>> quickly than before. The amount of data that must be
>>>>> transmitted to the server to satisfy the "latest mtime"
>>>>> requirement is limited. Also, the application writing to
>>>>> the file is blocked until the over the wire GETATTR is
>>>>> completed. This allows the GETATTR to be sent and the
>>>>> response received without competing with the data being
>>>>> written.
>>>>>
>>>>> No performance regressions were seen during informal
>>>>> performance testing.
>>>>>
>>>>> As a side note -- the more natural model of flow control
>>>>> would seem to be at the client/server level instead of
>>>>> the per-file level. However, that level was too coarse
>>>>> with the particular server that was required to be used
>>>>> because its requirements were at the per-file level.
>>>>>
>>>>>
>>>> I don't understand what you mean by "its requirements were at the
>>>> per-file level".
>>>>
>>>>
>>>>
>>>>> The new functionality in this patch is controlled via the
>>>>> use of the sysctl, nfs_max_outstanding_writes. It defaults
>>>>> to 0, meaning no flow control and the current behaviors.
>>>>> Setting it to any non-zero value enables the functionality.
>>>>> The value of 16 seems to be a good number and aligns with
>>>>> other NFS and RPC tunables.
>>>>>
>>>>> Lastly, the functionality of starting WRITE requests sooner
>>>>> to smooth out the i/o pattern should probably be done by the
>>>>> VM subsystem. I am looking into this, but in the meantime
>>>>> and to solve the immediate problem, this support is proposed.
>>>>>
>>>>>
>>>> It seems unfortunate if we add a sysctl to work around a problem
>>>> that
>>>> ends up being fixed some other way a version or two later.
>>>>
>>>> Would be great to have some progress on these problems, though....
>>>>
>>>> --b.
>>>>
>>>>
>>> Hi.
>>>
>>> I have attached a new testcase which exhibits this particular
>>> situation. One script writes out 6 ~1GB files in parallel,
>>> while the other script is simultaneously running an "ls -l"
>>> in the directory.
>>>
>>> When run on a system large enough to store all ~6GB of data,
>>> the dd processes basically write(2) all of their data into
>>> memory very quickly and then spend most of their time in the
>>> close(2) system call flushing the page cache due to the close
>>> to open processing.
>>>
>>> The current flow control support in the NFS client does not work
>>> well for this situation. It was designed to catch the process
>>> filling memory and to block it while the page cache flush is
>>> being done by the process doing the stat(2).
>>>
>>> The problem with this approach is that there could potentially be
>>> gigabytes of page cache which needs to be flushed to the server
>>> during the stat(2) processing. This blocks the application
>>> doing the stat(2) for potentially a very long time, based on the
>>> amount of data which was cached, the speed of the network, and
>>> the speed of the server.
>>>
>>> The solution is to limit the amount of data that must be flushed
>>> during the stat(2) call. This can be done by starting i/o when
>>> the application has filled enough pages to fill an entire wsize'd
>>> transfer and by limiting the number of these transfers which are
>>> outstanding so as not to overwhelm the server.
>>>
>>> -----------
>>>
>>> While it seems that it would be good to have this done by the
>>> VM itself, the current architecture of the VM does not seem to
>>> yield itself easily to doing this. It seems like doing something
>>> like a per-file bdi would do the trick, however the system is
>>> not scalable to the number of bdi's that that would require.
>>>
>>> I am open to suggestions for alternate solutions, but in the
>>> meantime, this support does seem to address the situation. In
>>> my test environment, it also increases, significantly,
>>> performance when sequentially writing large files. My throughput
>>> when dd'ing /dev/sda1 to an NFS mounted file went from ~22MB/s
>>> to ~38MB/s. (I do this for image backups for my laptop.) Your
>>> mileage may vary however. :-)
>>>
>>> So, can we consider taking this so that we can address some
>>> customer needs?
>>>
>>
>>
>> In the above mail, you are justifying the patch out of concern for
>> stat() behaviour, but (unless I'm looking at an outdated version)
>> that
>> is clearly not what has driven the design.
>> For instance, the call to nfs_wait_for_outstanding_writes() seems
>> to be
>> unnecessary to fix the issue of flow control in stat() to which you
>> refer above, and is likely to be detrimental to write() performance.
>> Also, you have the nfs_is_serial() heuristic, which turns it all
>> off in
>> the random writeback case. Again, that seems to have little to do
>> with
>> fixing stat().
>> I realise that your main motivation is to address the needs of the
>> customer in question, but I'm still not convinced that this is the
>> right
>> way to do it.
>>
>>
>
> Actually, I was able to solve the stat() problem as a side
> effect of the original design, but it seemed like additional
> reasons for wanting this code integrated.
>
> Yes, part of the architecture is to smooth the WRITE traffic
> and to keep from overwhelming the server. This is what the
> nfs_wait_for_outstanding_writes() does.
>
> I could update the changelog to mention that this support is
> disabled if the NFS client detects random access to the file.
> I added that so that applications such as databases wouldn't
> be harmed. I guess that I just took that sort of thing for
> granted and didn't think about it much further.
>
>> To address the actual issue of WRITE request reordering, do we know
>> why
>> the NFS client is generating out of order RPCs? Is it just reordering
>> within the RPC layer, or is it something else? For instance, I seem
>> to
>> recollect that Chris Mason mentioned WB_SYNC_NONE, as being a major
>> source of non-linearity when he looked at btrfs. I can imagine that
>> when
>> you combine that with the use of the 'range_cyclic' flag in
>> writeback_control, then you will get all sorts of "interesting"
>> request
>> orders...
>
> This version of the support does not address WRITE request
> reordering. The other changes to the system plus the NFS_INO_FLUSHING
> support that you added seem to address this inasmuch as I
> don't see out of order WRITE requests anymore.
>
> -----
>
> I am trying to accomplish two things here. The first thing
> was to smooth the WRITE traffic so that the client would perform
> better. Caching a few gigabytes of data and then flushing it to
> the server using a firehose doesn't seem to work very well. In
> a customer situation, I really had a server which could not keep
> up with the client. Something was needed to better match the
> client and server bandwidths.
>
> Second, I noticed that the architecture to smooth the WRITE
> traffic and do the flow control could be used very nicely to
> solve the stat() problem too. The smoothing of the WRITE
> traffic results in fewer dirty cached pages which need to get
> flushed to the server during the stat() processing. This helps
> to reduce the latency of the stat() call. Next, the flow control
> aspect can be used to block the application which is writing to
> the file while the GETATTR is outstanding. It happens without adding any
> more code to the writing path.
>
> I have spent quite a bit of time trying to measure the performance
> impact. As far as I can see, it varies from significantly better
> to no effect. Some things like dd run much better in my test
> network. Other things like rpmbuild don't appear to be affected.
> Compilations tend to be random access to files and are generally
> more cpu limited than i/o bound.
>
> -----
>
> I'd be happy to chat about any other ideas for ways to solve
> the issues that I need to solve. At the moment, there is a
> customer who is quite interested in getting the stat() problem
> resolved. (He may come to your attention from another
> direction as well.) We've given him a workaround, which may
> end up being his solution, but that workaround won't work for
> all of the rest of the people who have complained about the
> stat() problem.

"Me too"

We had an internal customer last year (which I believe I consulted you
about) with the same stat(2) problem. I think my workaround is
actually upstream, but the customer was not satisfied with just the
workaround and the answer "we need to fix the VM to make the problem
go away completely".

We were able to shorten the wait in stat(2) with the workaround and by
adjusting VM sysctls. It's still too long, though.

> Adding the locking for i_mutex around the
> page flushing did help to ensure that the stat() processing
> eventually succeeds, but left some problems with very large
> latencies while waiting for large page caches to flush. Even
> on my little 4GB system, those latencies can be 10s of seconds
> or more. This is not generally acceptable by our users.
>
> Thanx...
>
> ps

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com




2009-06-09 22:32:27

by Peter Staubach

Subject: Re: [PATCH v2] flow control for WRITE requests

Peter Staubach wrote:
> Trond Myklebust wrote:
>
>> On Tue, 2009-06-02 at 14:37 -0400, Peter Staubach wrote:
>>
>>
>>> Trond Myklebust wrote:
>>>
>>>
>>>> So, how about doing this by modifying balance_dirty_pages() instead?
>>>> Limiting pages on a per-inode basis isn't going to solve the common
>>>> problem of 'ls -l' performance, where you have to stat a whole bunch of
>>>> files, all of which may be dirty. To deal with that case, you really
>>>> need an absolute limit on the number of dirty pages.
>>>>
>>>> Currently, we have only relative limits: a given bdi is allowed a
>>>> maximum percentage value of the total write back cache size... We could
>>>> add a 'max_pages' field, that specifies an absolute limit at which the
>>>> vfs should start writeback.
>>>>
>>>>
>>> Interesting thought. From a high level, it sounds like a good
>>> strategy. The details start to get a little troubling to me
>>> though.
>>>
>>> First thing that strikes me is that this may result in
>>> suboptimal WRITE requests being issued over the wire. If the
>>> page quota is filled with many pages from one file and just a
>>> few from another due to timing, we may end up issuing small
>>> over the wire WRITE requests for the one file, even during
>>> normal operations.
>>>
>>>
>> balance_dirty_pages() will currently call writeback_inodes() to actually
>> flush out the pages. The latter will again check the super block dirty
>> list to determine candidate files; it doesn't favour the particular file
>> on which we called balance_dirty_pages_ratelimited().
>>
>>
>>
>
> It doesn't favor any files. It runs on all of them. Without
> some more clever smarts, we end up with small over the wire
> writes, which are to be avoided if at all possible.
>
>
>> That said, balance_dirty_pages_ratelimited() does take the mapping as an
>> argument. You could, therefore, in theory have it make decisions on a
>> per-mapping basis.
>>
>>
>>
>
> I will have to think about this more. Could you elaborate on
> what you were thinking that we might be able to do?
>
>
>>> We don't want to flush pages in the page cache until an entire
>>> wsize'd transfer can be constructed for the specific file.
>>> Thus, it seems to me that we still need to track the number of
>>> dirty pages per file.
>>>
>>> We also need to know that those pages are contiguous in the
>>> file. We can determine, heuristically, whether the pages are
>>> contiguous in the file or not by tracking the access pattern.
>>> For random access, we can assume that the pages are not
>>> contiguous and we can assume that they are contiguous for
>>> sequential access. This isn't perfect and can be fooled,
>>> but should hold for most applications which access files
>>> sequentially.
>>>
>>> Also, we don't want to proactively flush the cache if the
>>> application is doing random access. The application may come
>>> back to the page and we could get away with a single WRITE
>>> instead of multiple WRITE requests for the same page. With
>>> sequential access, we can generally know that it is safe to
>>> proactively flush pages because the application won't be
>>> accessing them again. Once again, this heuristic is not
>>> foolproof, but holds most of the time.
>>>
>>>
>> I'm not sure I follow you here. Why is the random access case any
>> different to the sequential access case? Random writes are obviously a
>> pain to deal with since you cannot predict access patterns. However,
>> AFAICS if we want to provide a faster generic stat(), then we need to
>> deal with random writes too: a gigabyte of data will take even longer to
>> flush out when it is in the form of non-contiguous writes.
>>
>>
>>
>
> I think that access patterns are important because we can't solve
> the ls performance problem at the expense of ruining all other
> performance. During normal operations, ie. without ls running in
> the directory, performance should be as close to what exists today
> as possible, or even better. I think that folks running a
> database in a file would probably not be happy with a tradeoff
> that makes ls run on the database files run faster while making
> the applications which update the database run slower. We have
> been busy trying to convince people to run databases on top of
> file systems instead of raw partitions and this would hurt.
>
> It would be nice to provide a faster generic stat(). However,
> I don't easily see how to do this, and it is not clear that we
> actually have to. We do need a faster stat() on files
> that are being sequentially written to. We have customer
> bugzillas and reports on this already. The people who run
> applications which use random access on files tend to be
> those who care more about the performance of those
> applications than someone running ls.
>
>
>
>>> For the ls case, we really want to manage the page cache on a
>>> per-directory basis. I don't think that this is going
>>> to happen. The only directions to go from there are more
>>> coarse, per-bdi, or less coarse, per-file.
>>>
>>>
>> Ugh. No...
>>
>>
>>
>
> Ugh, indeed. :-)
>
>
>>> If we go the per-bdi approach, then we would need to stop
>>> all modifications to the page cache for that particular bdi
>>> during the duration of the ls processing. Otherwise, as we
>>> stat 1 file at a time, the other files still needing to be
>>> stat'd would just refill the page cache with dirty pages.
>>> We could solve this by setting the max_pages limit to be a
>>> reasonable number to flush per file, but then that would be
>>> too small a limit for the entire file system.
>>>
>>>
>> True, but if you have applications writing to all the files in your
>> directory, then 'ls -l' performance is likely to suck anyway. Even if
>> you do have per-file limits, those write-backs to the other files will
>> be competing for RPC slots with the write-backs from the file that is
>> being stat()ed.
>>
>>
>>
>
> There is going to be a cost to running ls in a directory which
> contains files which are being actively written to. I don't
> see how to avoid this, given the architecture of maintaining
> file times by the server and the semantics required. We can
> strive to limit the cost though. I think that we can limit
> the cost without affecting normal operations.
>
>
>>> So, I don't see how to get around managing the page cache on
>>> a per-file basis, at least to some extent, in order to manage
>>> the amount of dirty data that must be flushed.
>>>
>>> It does seem like the right way to do this is via a combination
>>> of per-bdi and per-file support, but I am not sure that we have
>>> the right information at the right levels to achieve this now.
>>>
>>> Thanx...
>>>
>>> ps
>>>
>>>
>> In the long run, I'd like to see us merge something like the fstatat()
>> patches that were hacked together at the LSF'09 conference.
>> If applications can actually tell the NFS client that they don't care
>> about a/c/mtime accuracy, then we can avoid this whole flushing nonsense
>> altogether. It would suffice to teach 'ls' to start using the
>> AT_NO_TIMES flag that we defined...
>>
>
> Is someone pursuing those patches which were hacked together
> at LSF'09? Or, is there a specification or some sort of design
> document for the work?
>
> This would help in some cases. Using ls without any arguments
> that require file times could certainly be made to run lickety
> split. However, we would still be stuck with the crowd that
> needed to know file sizes or file times for their own nefarious
> reasons. It would be easy to dismiss these folks, until one
> becomes a paying support customer with enough clout to demand
> that something be done. I myself fall into this situation
> often enough that I wish to see it addressed.
>
> Thanx...
>
> ps

Hi.

I still need to move this along.

The proposed patch does make things better and, as far as I
can tell from my monitoring, does not seem to make things
worse from the user's perspective. It does restore some
of the support and complexity that was previously removed,
but I don't see how to solve the problem without it.

What do you think?

Thanx...

ps

2009-06-09 23:05:23

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

On Tue, 2009-06-09 at 18:32 -0400, Peter Staubach wrote:
> I still need to move this along.

Sorry, it has been a long week at home (state championships,
graduation...).

I did promise to send a dump of the state of the fstatat() stuff from
LSF (see attachments).

As for the patch you posted, I did have comments that haven't really
been addressed. As I said, I certainly don't see the need to have
write() wait for writebacks to complete. I also don't accept that we
need to treat random writes as fundamentally different from serial
writes.

I'm currently inclining towards adding a switch to turn off strict posix
behaviour. There weren't too many people asking for it earlier, and
there aren't that many applications out there that are sensitive to the
exact mtime. Samba and backup applications are the major exceptions to
that rule, but you don't really run those on top of NFS clients if you
can avoid it...

Cheers
Trond


Attachments:
1 (18.81 kB)
2 (15.71 kB)
3 (1.91 kB)

2009-06-10 19:43:48

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

Trond Myklebust wrote:
> On Tue, 2009-06-09 at 18:32 -0400, Peter Staubach wrote:
>
>> I still need to move this along.
>>
>
> Sorry, it has been a long week at home (state championships,
> graduation...).
>
>

State championships? How did they go?

> I did promise to send a dump of the state of the fstatat() stuff from
> LSF (see attachments).
>
>

Thanx! Seems fairly straightforward.

> As for the patch you posted, I did have comments that haven't really
> been addressed. As I said, I certainly don't see the need to have
> write() wait for writebacks to complete. I also don't accept that we
> need to treat random writes as fundamentally different from serial
> writes.
>

Sorry about not addressing your comments adequately.

Are you referring to nfs_wait_for_outstanding_writes() or
do you see someplace else that write() is waiting for
writebacks to complete? Perhaps I should have named it
nfs_wait_for_too_many_outstanding_writes()? :-)

That certainly was not the intention. The intention was to
have the pages gathered and then the over the wire stuff
handled asynchronously. If this is not true, then I need to
do some more work.

A goal of this work is to better match the bandwidth
offered by the network/server/storage with the rate at which
applications can create dirty pages. It is not good for the
application to get too far ahead and dirty too many pages.
This leads to the current problem with stat() as well as
much nastier out of memory conditions. If the system is not
capable of cleaning more than N GB/second, then it doesn't
make sense for applications to dirty more than that same
N GB/second. In the end, they won't be able to do that
anyway, so why tie up the memory, possibly causing problems?
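
As a rough illustration of the kind of gate being described here (this
is a sketch with invented names, not the code from the posted patch;
the check in the wait path is advisory, so a burst of writers can
briefly overshoot the limit):

#include <linux/wait.h>
#include <asm/atomic.h>

static int example_max_outstanding = 16;        /* hypothetical tunable */

struct example_write_throttle {
        atomic_t                in_flight;      /* writebacks issued, not yet complete */
        wait_queue_head_t       wq;             /* writers sleep here when over the limit */
};

static void example_throttle_init(struct example_write_throttle *t)
{
        atomic_set(&t->in_flight, 0);
        init_waitqueue_head(&t->wq);
}

/* Called before queueing another writeback: sleep until below the limit. */
static void example_throttle_wait(struct example_write_throttle *t)
{
        wait_event(t->wq,
                   atomic_read(&t->in_flight) < example_max_outstanding);
        atomic_inc(&t->in_flight);
}

/* Called when a writeback completes: drop the count and wake waiters. */
static void example_throttle_done(struct example_write_throttle *t)
{
        if (atomic_dec_return(&t->in_flight) < example_max_outstanding)
                wake_up(&t->wq);
}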

I see random access as being different than sequential
mostly due to the expectations that the different style
applications have. Applications which access a file
sequentially typically do not expect to access the pages
again after either reading them or writing them. This does
not mean that we should toss them from the page cache, but
it does mean that we can start writing them because the
chances of the application returning to update the contents
of the pages are minimized and the pages will need to get
written anyway.

Applications that use random access patterns are much more
likely to return to existing pages and modify them for a
second time. Proactively writing these pages means that
multiple over the wire writes would be required when fewer
over the wire writes would have actually been required by
waiting.
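
As a hedged illustration of that heuristic: remember where the last
write on a file ended, and treat a write that starts exactly there as
sequential. The structure and names below are invented for the
example:

#include <linux/types.h>

/* Hypothetical per-file bookkeeping for the access-pattern guess. */
struct example_write_pattern {
        loff_t  next_expected;  /* one byte past the end of the previous write */
        int     sequential;     /* current guess about the access pattern */
};

/*
 * Record a write of @count bytes at @pos.  A write that starts exactly
 * where the last one ended is taken as sequential; any other offset
 * flips the guess to random.  As noted above, this can be fooled, but
 * it is cheap and right for most sequential writers.
 */
static void example_note_write(struct example_write_pattern *p,
                               loff_t pos, size_t count)
{
        p->sequential = (pos == p->next_expected);
        p->next_expected = pos + count;
}

/* Only flush proactively for files that currently look sequential. */
static int example_flush_early(const struct example_write_pattern *p)
{
        return p->sequential;
}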

> I'm currently inclining towards adding a switch to turn off strict posix
> behaviour. There weren't too many people asking for it earlier, and
> there aren't that many applications out there that are sensitive to the
> exact mtime. Samba and backup applications are the major exceptions to
> that rule, but you don't really run those on top of NFS clients if you
> can avoid it...

While I think that this switch is an okay idea and will help
some applications which get modified to use it, it does not
help existing applications or applications which want the
correct time values and also reasonable performance.

I believe that we can help all applications by reviewing the
page cache handling architecture for the NFS client.

Thanx...

ps

2009-06-01 21:48:11

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

On Thu, 2009-05-28 at 11:41 -0400, Peter Staubach wrote:
> -----
>
> I am trying to accomplish two things here. The first thing
> was to smooth the WRITE traffic so that the client would perform
> better. Caching a few gigabytes of data and then flushing it to
> the server using a firehose doesn't seem to work very well. In
> a customer situation, I really had a server which could not keep
> up with the client. Something was needed to better match the
> client and server bandwidths.
>
> Second, I noticed that the architecture to smooth the WRITE
> traffic and do the flow control could be used very nicely to
> solve the stat() problem too. The smoothing of the WRITE
> traffic results in fewer dirty cached pages which need to get
> flushed to the server during the stat() processing. This helps
> to reduce the latency of the stat() call. Next, the flow control
> aspect can be used to block the application which is writing to
> the file. It happens without adding any more code to the
> writing path.
>
> I have spent quite a bit of time trying to measure the performance
> impact. As far as I can see, it varies from significantly better
> to no affect. Some things like dd run much better in my test
> network. Other things like rpmbuild don't appear to be affected.
> Compilations tend to be random access to files and are generally
> more cpu limited than i/o bound.

So, how about doing this by modifying balance_dirty_pages() instead?
Limiting pages on a per-inode basis isn't going to solve the common
problem of 'ls -l' performance, where you have to stat a whole bunch of
files, all of which may be dirty. To deal with that case, you really
need an absolute limit on the number of dirty pages.

Currently, we have only relative limits: a given bdi is allowed a
maximum percentage value of the total write back cache size... We could
add a 'max_pages' field, that specifies an absolute limit at which the
vfs should start writeback.

Cheers
Trond


2009-06-02 18:37:32

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

Trond Myklebust wrote:
>
> So, how about doing this by modifying balance_dirty_pages() instead?
> Limiting pages on a per-inode basis isn't going to solve the common
> problem of 'ls -l' performance, where you have to stat a whole bunch of
> files, all of which may be dirty. To deal with that case, you really
> need an absolute limit on the number of dirty pages.
>
> Currently, we have only relative limits: a given bdi is allowed a
> maximum percentage value of the total write back cache size... We could
> add a 'max_pages' field, that specifies an absolute limit at which the
> vfs should start writeback.

Interesting thought. From a high level, it sounds like a good
strategy. The details start to get a little troubling to me
though.

First thing that strikes me is that this may result in
suboptimal WRITE requests being issued over the wire. If the
page quota is filled with many pages from one file and just a
few from another due to timing, we may end up issuing small
over the wire WRITE requests for the one file, even during
normal operations.

We don't want to flush pages in the page cache until an entire
wsize'd transfer can be constructed for the specific file.
Thus, it seems to me that we still need to track the number of
dirty pages per file.
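
To make the wsize point concrete, the per-file dirty count could gate
flushing roughly like this (a sketch only; the names are invented, and
the real decision would also have to consider contiguity, as argued
next):

#include <linux/mm.h>

/*
 * Sketch: only consider a file worth flushing once it has accumulated
 * enough dirty pages to fill at least one wsize'd WRITE over the wire.
 */
static int example_worth_flushing(unsigned long ndirty_pages,
                                  unsigned int wsize)
{
        unsigned long pages_per_write = (wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;

        return ndirty_pages >= pages_per_write;
}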

We also need to know that those pages are contiguous in the
file. We can determine, heuristically, whether the pages are
contiguous in the file or not by tracking the access pattern.
For random access, we can assume that the pages are not
contiguous and we can assume that they are contiguous for
sequential access. This isn't perfect and can be fooled,
but should hold for most applications which access files
sequentially.

Also, we don't want to proactively flush the cache if the
application is doing random access. The application may come
back to the page and we could get away with a single WRITE
instead of multiple WRITE requests for the same page. With
sequential access, we can generally know that it is safe to
proactively flush pages because the application won't be
accessing them again. Once again, this heuristic is not
foolproof, but holds most of the time.

For the ls case, we really want to manage the page cache on a
per-directory basis. I don't think that this is going
to happen. The only directions to go from there are more
coarse, per-bdi, or less coarse, per-file.

If we go the per-bdi approach, then we would need to stop
all modifications to the page cache for that particular bdi
during the duration of the ls processing. Otherwise, as we
stat 1 file at a time, the other files still needing to be
stat'd would just refill the page cache with dirty pages.
We could solve this by setting the max_pages limit to be a
reasonable number to flush per file, but then that would be
too small a limit for the entire file system.

So, I don't see how to get around managing the page cache on
a per-file basis, at least to some extent, in order to manage
the amount of dirty data that must be flushed.

It does seem like the right way to do this is via a combination
of per-bdi and per-file support, but I am not sure that we have
the right information at the right levels to achieve this now.

Thanx...

ps

2009-06-02 22:12:19

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

On Tue, 2009-06-02 at 14:37 -0400, Peter Staubach wrote:
> Trond Myklebust wrote:
> >
> > So, how about doing this by modifying balance_dirty_pages() instead?
> > Limiting pages on a per-inode basis isn't going to solve the common
> > problem of 'ls -l' performance, where you have to stat a whole bunch of
> > files, all of which may be dirty. To deal with that case, you really
> > need an absolute limit on the number of dirty pages.
> >
> > Currently, we have only relative limits: a given bdi is allowed a
> > maximum percentage value of the total write back cache size... We could
> > add a 'max_pages' field, that specifies an absolute limit at which the
> > vfs should start writeback.
>
> Interesting thought. From a high level, it sounds like a good
> strategy. The details start to get a little troubling to me
> though.
>
> First thing that strikes me is that this may result in
> suboptimal WRITE requests being issued over the wire. If the
> page quota is filled with many pages from one file and just a
> few from another due to timing, we may end up issuing small
> over the wire WRITE requests for the one file, even during
> normal operations.

balance_dirty_pages() will currently call writeback_inodes() to actually
flush out the pages. The latter will again check the super block dirty
list to determine candidate files; it doesn't favour the particular file
on which we called balance_dirty_pages_ratelimited().

That said, balance_dirty_pages_ratelimited() does take the mapping as an
argument. You could, therefore, in theory have it make decisions on a
per-mapping basis.
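
For illustration only, a per-mapping policy hooked near that call site
might look something like the sketch below; the per-file counter and
threshold are hypothetical, and a real implementation would live
inside the balancing code rather than beside it:

#include <linux/fs.h>
#include <linux/writeback.h>

/*
 * Sketch: in addition to the global balancing, kick asynchronous
 * writeback for just this file once its own dirty-page count crosses
 * a per-file threshold.  Both values here are supplied by the caller
 * and stand in for whatever state the real code would keep.
 */
static void example_balance_this_file(struct address_space *mapping,
                                      unsigned long file_dirty_pages,
                                      unsigned long per_file_thresh)
{
        /* Existing global, per-task-ratelimited balancing. */
        balance_dirty_pages_ratelimited(mapping);

        /* Hypothetical per-mapping decision. */
        if (file_dirty_pages >= per_file_thresh)
                filemap_flush(mapping);
}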

> We don't want to flush pages in the page cache until an entire
> wsize'd transfer can be constructed for the specific file.
> Thus, it seems to me that we still need to track the number of
> dirty pages per file.
>
> We also need to know that those pages are contiguous in the
> file. We can determine, heuristically, whether the pages are
> contiguous in the file or not by tracking the access pattern.
> For random access, we can assume that the pages are not
> contiguous and we can assume that they are contiguous for
> sequential access. This isn't perfect and can be fooled,
> but should hold for most applications which access files
> sequentially.
>
> Also, we don't want to proactively flush the cache if the
> application is doing random access. The application may come
> back to the page and we could get away with a single WRITE
> instead of multiple WRITE requests for the same page. With
> sequential access, we can generally know that it is safe to
> proactively flush pages because the application won't be
> accessing them again. Once again, this heuristic is not
> foolproof, but holds most of the time.

I'm not sure I follow you here. Why is the random access case any
different to the sequential access case? Random writes are obviously a
pain to deal with since you cannot predict access patterns. However,
AFAICS if we want to provide a faster generic stat(), then we need to
deal with random writes too: a gigabyte of data will take even longer to
flush out when it is in the form of non-contiguous writes.

> For the ls case, we really want to manage the page cache on a
> per-directory basis. I don't think that this is going
> to happen. The only directions to go from there are more
> coarse, per-bdi, or less coarse, per-file.

Ugh. No...

> If we go the per-bdi approach, then we would need to stop
> all modifications to the page cache for that particular bdi
> during the duration of the ls processing. Otherwise, as we
> stat 1 file at a time, the other files still needing to be
> stat'd would just refill the page cache with dirty pages.
> We could solve this by setting the max_pages limit to be a
> reasonable number to flush per file, but then that would be
> too small a limit for the entire file system.

True, but if you have applications writing to all the files in your
directory, then 'ls -l' performance is likely to suck anyway. Even if
you do have per-file limits, those write-backs to the other files will
be competing for RPC slots with the write-backs from the file that is
being stat()ed.

> So, I don't see how to get around managing the page cache on
> a per-file basis, at least to some extent, in order to manage
> the amount of dirty data that must be flushed.
>
> It does seem like the right way to do this is via a combination
> of per-bdi and per-file support, but I am not sure that we have
> the right information at the right levels to achieve this now.
>
> Thanx...
>
> ps

In the long run, I'd like to see us merge something like the fstatat()
patches that were hacked together at the LSF'09 conference.
If applications can actually tell the NFS client that they don't care
about a/c/mtime accuracy, then we can avoid this whole flushing nonsense
altogether. It would suffice to teach 'ls' to start using the
AT_NO_TIMES flag that we defined...

Cheers
Trond


2009-06-03 14:17:48

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests

Trond Myklebust wrote:
> On Tue, 2009-06-02 at 14:37 -0400, Peter Staubach wrote:
>
>> Trond Myklebust wrote:
>>
>>> So, how about doing this by modifying balance_dirty_pages() instead?
>>> Limiting pages on a per-inode basis isn't going to solve the common
>>> problem of 'ls -l' performance, where you have to stat a whole bunch of
>>> files, all of which may be dirty. To deal with that case, you really
>>> need an absolute limit on the number of dirty pages.
>>>
>>> Currently, we have only relative limits: a given bdi is allowed a
>>> maximum percentage value of the total write back cache size... We could
>>> add a 'max_pages' field, that specifies an absolute limit at which the
>>> vfs should start writeback.
>>>
>> Interesting thought. From a high level, it sounds like a good
>> strategy. The details start to get a little troubling to me
>> though.
>>
>> First thing that strikes me is that this may result in
>> suboptimal WRITE requests being issued over the wire. If the
>> page quota is filled with many pages from one file and just a
>> few from another due to timing, we may end up issuing small
>> over the wire WRITE requests for the one file, even during
>> normal operations.
>>
>
> balance_dirty_pages() will currently call writeback_inodes() to actually
> flush out the pages. The latter will again check the super block dirty
> list to determine candidate files; it doesn't favour the particular file
> on which we called balance_dirty_pages_ratelimited().
>
>

It doesn't favor any files. It runs on all of them. Without
some more clever smarts, we end up with small over the wire
writes, which are to be avoided if at all possible.

> That said, balance_dirty_pages_ratelimited() does take the mapping as an
> argument. You could, therefore, in theory have it make decisions on a
> per-mapping basis.
>
>

I will have to think about this more. Could you elaborate on
what you were thinking that we might be able to do?

>> We don't want to flush pages in the page cache until an entire
>> wsize'd transfer can be constructed for the specific file.
>> Thus, it seems to me that we still need to track the number of
>> dirty pages per file.
>>
>> We also need to know that those pages are contiguous in the
>> file. We can determine, heuristically, whether the pages are
>> contiguous in the file or not by tracking the access pattern.
>> For random access, we can assume that the pages are not
>> contiguous and we can assume that they are contiguous for
>> sequential access. This isn't perfect and can be fooled,
>> but should hold for most applications which access files
>> sequentially.
>>
>> Also, we don't want to proactively flush the cache if the
>> application is doing random access. The application may come
>> back to the page and we could get away with a single WRITE
>> instead of multiple WRITE requests for the same page. With
>> sequential access, we can generally know that it is safe to
>> proactively flush pages because the application won't be
>> accessing them again. Once again, this heuristic is not
>> foolproof, but holds most of the time.
>>
>
> I'm not sure I follow you here. Why is the random access case any
> different to the sequential access case? Random writes are obviously a
> pain to deal with since you cannot predict access patterns. However,
> AFAICS if we want to provide a faster generic stat(), then we need to
> deal with random writes too: a gigabyte of data will take even longer to
> flush out when it is in the form of non-contiguous writes.
>
>

I think that access patterns are important because we can't solve
the ls performance problem at the expense of ruining all other
performance. During normal operations, i.e. without ls running in
the directory, performance should be as close to what exists today
as possible, or even better. I think that folks running a
database in a file would probably not be happy with a tradeoff
that makes ls on the database files run faster while making
the applications which update the database run slower. We have
been busy trying to convince people to run databases on top of
file systems instead of raw partitions and this would hurt.

It would be nice to provide a faster generic stat(). However,
I don't easily see how to do this, and it is not clear that we
actually have to. We do need a faster stat() on files
that are being sequentially written to. We have customer
bugzillas and reports on this already. The people who run
applications which use random access on files tend to be
those who care more about the performance of those
applications than someone running ls.


>> For the ls case, we really want to manage the page cache on a
>> per-directory basis. I don't think that this is going
>> to happen. The only directions to go from there are more
>> coarse, per-bdi, or less coarse, per-file.
>>
>
> Ugh. No...
>
>

Ugh, indeed. :-)

>> If we go the per-bdi approach, then we would need to stop
>> all modifications to the page cache for that particular bdi
>> during the duration of the ls processing. Otherwise, as we
>> stat 1 file at a time, the other files still needing to be
>> stat'd would just refill the page cache with dirty pages.
>> We could solve this by setting the max_pages limit to be a
>> reasonable number to flush per file, but then that would be
>> too small a limit for the entire file system.
>>
>
> True, but if you have applications writing to all the files in your
> directory, then 'ls -l' performance is likely to suck anyway. Even if
> you do have per-file limits, those write-backs to the other files will
> be competing for RPC slots with the write-backs from the file that is
> being stat()ed.
>
>

There is going to be a cost to running ls in a directory which
contains files which are being actively written to. I don't
see how to avoid this, given the architecture of maintaining
file times by the server and the semantics required. We can
strive to limit the cost though. I think that we can limit
the cost without affecting normal operations.

>> So, I don't see how to get around managing the page cache on
>> a per-file basis, at least to some extent, in order to manage
>> the amount of dirty data that must be flushed.
>>
>> It does seem like the right way to do this is via a combination
>> of per-bdi and per-file support, but I am not sure that we have
>> the right information at the right levels to achieve this now.
>>
>> Thanx...
>>
>> ps
>>
>
> In the long run, I'd like to see us merge something like the fstatat()
> patches that were hacked together at the LSF'09 conference.
> If applications can actually tell the NFS client that they don't care
> about a/c/mtime accuracy, then we can avoid this whole flushing nonsense
> altogether. It would suffice to teach 'ls' to start using the
> AT_NO_TIMES flag that we defined...

Is someone pursuing those patches which were hacked together
at LSF'09? Or, is there a specification or some sort of design
document for the work?

This would help in some cases. Using ls without any arguments
that require file times could certainly be made to run lickety
split. However, we would still be stuck with the crowd that
needed to know file sizes or file times for their own nefarious
reasons. It would be easy to dismiss these folks, until one
becomes a paying support customer with enough clout to demand
that something be done. I myself fall into this situation
often enough that I wish to see it addressed.

Thanx...

ps

2009-07-06 00:48:07

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH v2] flow control for WRITE requests


>
> I believe that we can help all applications by reviewing the
> page cache handling architecture for the NFS client.

(coming in on this a bit late ... we now have a customer hitting the
'stat' problem too :-( )

I think the problem is bigger than just an NFS problem; it would
be best if we could solve it in a system-wide manner. That
might be hard, and there might still be a case for some NFS-specific
fix, but I'll try to explain how I see it and where I think the fix
should be.

As has been noted, the VM throttles writes when the amount of dirty
page-cache memory crosses some threshold - this is handled in
balance_dirty_pages.

The threshold can be set either with vm_dirty_ratio (20% of available
memory by default) or vm_dirty_bytes (which is useful when you want to
set a ratio below 1% due to the size of memory).
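
In simplified form, the global threshold works out to roughly the
following (this glosses over the background limit and the per-task and
per-bdi scaling that the real code applies):

#include <linux/mm.h>

/*
 * Simplified view of the global dirty threshold: an absolute byte
 * limit wins if it is set, otherwise a percentage of dirtyable memory
 * is used.
 */
static unsigned long example_dirty_thresh(unsigned long dirtyable_pages,
                                          unsigned long dirty_bytes,
                                          int dirty_ratio)
{
        if (dirty_bytes)
                return dirty_bytes / PAGE_SIZE;
        return dirtyable_pages * dirty_ratio / 100;
}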

I think it would be very useful if this could be set in 'seconds'
rather than 'bytes', i.e. writeout should be throttled when the
amount of dirty memory is more than can be written out in N seconds -
e.g. '3'.
Implementing this would not be straightforward, but I think it should
be possible.
One approach might be for balance_dirty_pages to count how long it
took to write out its allocation of pages and merge this into a
per-bdi floating 'writeout rate' number. From this number we can
calculate a maximum number of dirty pages that are allowed for that
bdi, based on the maximum number of seconds we want writeout to take.
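
A sketch of what that bookkeeping might look like; the smoothing
factor and names are made up, and a real version would need per-bdi
storage and locking:

#include <linux/jiffies.h>

/* Hypothetical per-bdi state for a "dirty limit in seconds" policy. */
struct example_writeout_rate {
        unsigned long pages_per_sec;    /* smoothed writeout rate estimate */
};

/*
 * Fold one completed writeout chunk into the estimate with a simple
 * exponentially weighted moving average (7/8 old, 1/8 new sample).
 */
static void example_update_rate(struct example_writeout_rate *r,
                                unsigned long pages_written,
                                unsigned long elapsed_jiffies)
{
        unsigned long sample;

        if (!elapsed_jiffies)
                elapsed_jiffies = 1;
        sample = pages_written * HZ / elapsed_jiffies;
        r->pages_per_sec = (r->pages_per_sec * 7 + sample) / 8;
}

/* Allow at most max_seconds worth of dirty pages for this bdi. */
static unsigned long example_dirty_limit(const struct example_writeout_rate *r,
                                         unsigned int max_seconds)
{
        return r->pages_per_sec * max_seconds;
}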

Doing this would cause 'sync' and similar functions to take a
controlled amount of time instead of time proportional to available
memory, which on tens-of-gigabyte machines can be an awfully long
time ... we had a different customer with 32Gig of RAM and 70MB/sec disk
drives. That is 90 seconds to sync - best case.

I think this would address the 'stat' problem by limiting the time a
sequence of stats can take to approximately the configured 'sync'
time.
I don't think we need to worry about limiting each file.
If we are writing out to, say, 10 files in one directory, then each
will, on average, have 1/10 of the allowed number of pages. So
syncing each file in turn for 'stat' should take about 10 times that,
or the total allowed time.
It could get a bit worse than that: while one file is flushing, the
other files could each grow to 1/9 of the allowed number of
pages. But 10 lots of 1/9 is still not much more than 1.
I think the worst case would be if two files in a directory were being
written to. They might both be using 1/2 of the allowed number of
pages.
If we flush one, the other can grow to use all the allowed pages which
we must then flush. This would make the total time to stat both files
about 1.5 times the configured time, which isn't a bad worst case
(actually I think you can get worse cases up to 2x if the dirty pages
in the files aren't balanced, but that is still a controlled number).
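
To put rough, purely illustrative numbers on that worst case: with a
3 second limit and a bdi that writes out at 100MB/s, the allowed dirty
pool is about 300MB. Two files at 150MB each mean roughly 1.5 seconds
to flush the first, during which the second can grow to the full 300MB
and then takes about 3 seconds, so around 4.5 seconds in total, i.e.
1.5 times the configured time.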


This might not address your problems with your interesting NFS server
with its limited buffer.
Maybe for that you want a per-bdi 'dirty_bytes' configurable. That
should be very easy to implement and test: such a configurable could
be exposed via sysfs, and get_dirty_limits() takes a bdi, so clipping
the bdi_dirty value to this configurable would be quite
straightforward.
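
A rough sketch of that clipping; the per-bdi byte limit is
hypothetical (it does not exist in the stock kernel), and
get_dirty_limits() lives in mm/page-writeback.c, so the clamp would
go there:

#include <linux/mm.h>

/*
 * Hypothetical: if a bdi had an absolute dirty-byte limit configured
 * (say via a new sysfs attribute), clip the computed per-bdi dirty
 * threshold to it.
 */
static unsigned long example_clip_bdi_dirty(unsigned long bdi_dirty_thresh,
                                            unsigned long bdi_max_dirty_bytes)
{
        unsigned long max_pages = bdi_max_dirty_bytes >> PAGE_SHIFT;

        if (max_pages && bdi_dirty_thresh > max_pages)
                bdi_dirty_thresh = max_pages;
        return bdi_dirty_thresh;
}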

NeilBrown