2009-01-23 19:10:42

by Peter Staubach

[permalink] [raw]
Subject: [PATCH] flow control for WRITE requests

--- linux-2.6.28.i686/fs/nfs/inode.c.org
+++ linux-2.6.28.i686/fs/nfs/inode.c
@@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
int err;
+ int inc_outstanding_writes = nfs_max_outstanding_writes;

/*
* Flush out writes to the server in order to update c/mtime.
@@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
* nfs_wb_nocommit.
*/
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
+ atomic_add(inc_outstanding_writes, &nfsi->writes);
nfs_wb_nocommit(inode);
- mutex_unlock(&inode->i_mutex);
}

/*
@@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
+ if (S_ISREG(inode->i_mode)) {
+ atomic_sub(inc_outstanding_writes, &nfsi->writes);
+ wake_up(&nfsi->writes_wq);
+ }
return err;
}

@@ -1288,9 +1293,13 @@ static void init_once(void *foo)
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
nfsi->ncommit = 0;
nfsi->npages = 0;
+ atomic_set(&nfsi->ndirty, 0);
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
init_waitqueue_head(&nfsi->waitqueue);
+ atomic_set(&nfsi->writes, 0);
+ init_waitqueue_head(&nfsi->writes_wq);
+ nfsi->wrpos = 0;
nfs4_init_once(nfsi);
}

--- linux-2.6.28.i686/fs/nfs/write.c.org
+++ linux-2.6.28.i686/fs/nfs/write.c
@@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
if (!ret) {
struct inode *inode = page->mapping->host;
struct nfs_server *nfss = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);

+ atomic_dec(&nfsi->ndirty);
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
set_bdi_congested(&nfss->backing_dev_info, WRITE);
@@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
return ret;
}

+int nfs_max_outstanding_writes = 16;
+
+static void nfs_inc_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ atomic_inc(&nfsi->writes);
+}
+
+static void nfs_dec_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (atomic_dec_return(&nfsi->writes) < nfs_max_outstanding_writes)
+ wake_up(&nfsi->writes_wq);
+}
+
+void nfs_wait_for_outstanding_writes(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (nfs_max_outstanding_writes) {
+ wait_event(nfsi->writes_wq,
+ atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
+ }
+}
+
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
@@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
nfsi->npages++;
+ atomic_inc(&nfsi->ndirty);
kref_get(&req->wb_kref);
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_LOCKED);
@@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ atomic_inc(&nfsi->ndirty);
__set_page_dirty_nobuffers(req->wb_page);
}

@@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
req->wb_bytes = end - req->wb_offset;
else
req->wb_bytes = rqend - req->wb_offset;
+ atomic_inc(&NFS_I(inode)->ndirty);
out_unlock:
spin_unlock(&inode->i_lock);
return req;
@@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
count,
(unsigned long long)data->args.offset);

+ nfs_inc_outstanding_writes(inode);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
*/
status = NFS_PROTO(data->inode)->write_done(task, data);
if (status != 0)
- return status;
+ goto out;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
/* Can't do anything about it except throw an error. */
task->tk_status = -EIO;
}
- return 0;
+out:
+ nfs_dec_outstanding_writes(data->inode);
+ return status;
}


@@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}

+/*
+ * Start the WRITE requests for dirty pages on their way.
+ * This is used when a sufficient number of dirty pages
+ * have accumulated.
+ */
+int nfs_wb_interim(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = {
+ .bdi = mapping->backing_dev_info,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ int ret;
+
+ ret = nfs_writepages(mapping, &wbc);
+ if (ret < 0)
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ return ret;
+}
+
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
--- linux-2.6.28.i686/fs/nfs/sysctl.c.org
+++ linux-2.6.28.i686/fs/nfs/sysctl.c
@@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_max_outstanding_writes",
+ .data = &nfs_max_outstanding_writes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};

--- linux-2.6.28.i686/fs/nfs/file.c.org
+++ linux-2.6.28.i686/fs/nfs/file.c
@@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
return 0;
}

+static int nfs_is_serial(struct inode *inode, loff_t pos)
+{
+ return NFS_I(inode)->wrpos == pos;
+}
+
static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct dentry * dentry = iocb->ki_filp->f_path.dentry;
- struct inode * inode = dentry->d_inode;
+ struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
ssize_t result;
size_t count = iov_length(iov, nr_segs);

@@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
+
+ result = count;
+ if (!count)
+ goto out;
+
+ nfs_wait_for_outstanding_writes(inode);
+
/*
* O_APPEND implies that we must revalidate the file length.
*/
@@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
goto out;
}

- result = count;
- if (!count)
- goto out;
-
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
/* Return error values for O_SYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
- int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
- if (err < 0)
- result = err;
+ if (result >= 0) {
+ if (nfs_need_sync_write(iocb->ki_filp, inode)) {
+ int err;
+ err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
+ inode);
+ if (err < 0)
+ result = err;
+ } else if (nfs_max_outstanding_writes &&
+ nfs_is_serial(inode, pos) &&
+ atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
+ nfs_wb_interim(inode);
+ if (result > 0)
+ nfsi->wrpos = pos + result;
}
out:
return result;
--- linux-2.6.28.i686/include/linux/nfs_fs.h.org
+++ linux-2.6.28.i686/include/linux/nfs_fs.h
@@ -168,6 +168,7 @@ struct nfs_inode {

unsigned long ncommit,
npages;
+ atomic_t ndirty;

/* Open contexts for shared mmap writes */
struct list_head open_files;
@@ -186,6 +187,9 @@ struct nfs_inode {
fmode_t delegation_state;
struct rw_semaphore rwsem;
#endif /* CONFIG_NFS_V4*/
+ atomic_t writes; /* number of outstanding WRITEs */
+ wait_queue_head_t writes_wq;
+ loff_t wrpos; /* position after last WRITE */
struct inode vfs_inode;
};

@@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
* linux/fs/nfs/write.c
*/
extern int nfs_congestion_kb;
+extern int nfs_max_outstanding_writes;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
extern void nfs_writedata_release(void *);
+extern void nfs_wait_for_outstanding_writes(struct inode *);

/*
* Try to write back everything synchronously (but check the
@@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
extern int nfs_wb_nocommit(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page* page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
+extern int nfs_wb_interim(struct inode *);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_write_data *nfs_commitdata_alloc(void);


Attachments:
flow_control.devel (9.24 kB)

2009-01-23 20:36:17

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

How does this play with memory pressure?

I would expect the generic background page flushers, like pdflush and
kswapd, could still cause out-of-order writes.

On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:

> Hi.
>
> Attached is a patch which implements some flow control for the
> NFS client to control dirty pages. The flow control is
> implemented on a per-file basis and causes dirty pages to be
> written out when the client can detect that the application is
> writing in a serial fashion and has dirtied enough pages to
> fill a complete over the wire transfer.
>
> This work was precipitated by working on a situation where a
> server at a customer site was not able to adequately handle
> the behavior of the Linux NFS client. This particular server
> required that all data to the file written to the file be
> written in a strictly serial fashion. It also had problems
> handling the Linux NFS client semantic of caching a large
> amount of data and then sending out that data all at once.
>
> The sequential ordering problem was resolved by a previous
> patch which was submitted to the linux-nfs list. This patch
> addresses the capacity problem.
>
> The problem is resolved by sending WRITE requests much
> earlier in the process of the application writing to the file.
> The client keeps track of the number of dirty pages associated
> with the file and also the last offset of the data being
> written. When the client detects that a full over the wire
> transfer could be constructed and that the application is
> writing sequentially, then it generates an UNSTABLE write to
> server for the currently dirty data.
>
> The client also keeps track of the number of these WRITE
> requests which have been generated. It flow controls based
> on a configurable maximum. This keeps the client from
> completely overwhelming the server.
>
> A nice side effect of the framework is that the issue of
> stat()'ing a file being written can be handled much more
> quickly than before. The amount of data that must be
> transmitted to the server to satisfy the "latest mtime"
> requirement is limited. Also, the application writing to
> the file is blocked until the over the wire GETATTR is
> completed. This allows the GETATTR to be send and the
> response received without competing with the data being
> written.
>
> I did not do formal performance testing, but in my
> watching testing, I did not see any performance regressions.
>
> As a side note -- the more natural model of flow control
> would seem to be at the client/server level instead of
> the per-file level. However, that level was too coarse
> with the particular server that was required to be used
> because its requirements were at the per-file level.
>
> Thanx...
>
> ps
>
> Signed-off-by: Peter Staubach <[email protected]>
> --- linux-2.6.28.i686/fs/nfs/inode.c.org
> +++ linux-2.6.28.i686/fs/nfs/inode.c
> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
> kstat *stat)
> {
> struct inode *inode = dentry->d_inode;
> - int need_atime = NFS_I(inode)->cache_validity &
> NFS_INO_INVALID_ATIME;
> + struct nfs_inode *nfsi = NFS_I(inode);
> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
> int err;
> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>
> /*
> * Flush out writes to the server in order to update c/mtime.
> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
> * nfs_wb_nocommit.
> */
> if (S_ISREG(inode->i_mode)) {
> - mutex_lock(&inode->i_mutex);
> + atomic_add(inc_outstanding_writes, &nfsi->writes);
> nfs_wb_nocommit(inode);
> - mutex_unlock(&inode->i_mutex);
> }
>
> /*
> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
> generic_fillattr(inode, stat);
> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
> }
> + if (S_ISREG(inode->i_mode)) {
> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
> + wake_up(&nfsi->writes_wq);
> + }
> return err;
> }
>
> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
> nfsi->ncommit = 0;
> nfsi->npages = 0;
> + atomic_set(&nfsi->ndirty, 0);
> atomic_set(&nfsi->silly_count, 1);
> INIT_HLIST_HEAD(&nfsi->silly_list);
> init_waitqueue_head(&nfsi->waitqueue);
> + atomic_set(&nfsi->writes, 0);
> + init_waitqueue_head(&nfsi->writes_wq);
> + nfsi->wrpos = 0;
> nfs4_init_once(nfsi);
> }
>
> --- linux-2.6.28.i686/fs/nfs/write.c.org
> +++ linux-2.6.28.i686/fs/nfs/write.c
> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
> if (!ret) {
> struct inode *inode = page->mapping->host;
> struct nfs_server *nfss = NFS_SERVER(inode);
> + struct nfs_inode *nfsi = NFS_I(inode);
>
> + atomic_dec(&nfsi->ndirty);
> if (atomic_long_inc_return(&nfss->writeback) >
> NFS_CONGESTION_ON_THRESH)
> set_bdi_congested(&nfss->backing_dev_info, WRITE);
> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
> return ret;
> }
>
> +int nfs_max_outstanding_writes = 16;
> +
> +static void nfs_inc_outstanding_writes(struct inode *inode)
> +{
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + atomic_inc(&nfsi->writes);
> +}
> +
> +static void nfs_dec_outstanding_writes(struct inode *inode)
> +{
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + if (atomic_dec_return(&nfsi->writes) < nfs_max_outstanding_writes)
> + wake_up(&nfsi->writes_wq);
> +}
> +
> +void nfs_wait_for_outstanding_writes(struct inode *inode)
> +{
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + if (nfs_max_outstanding_writes) {
> + wait_event(nfsi->writes_wq,
> + atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
> + }
> +}
> +
> int nfs_writepages(struct address_space *mapping, struct
> writeback_control *wbc)
> {
> struct inode *inode = mapping->host;
> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
> SetPagePrivate(req->wb_page);
> set_page_private(req->wb_page, (unsigned long)req);
> nfsi->npages++;
> + atomic_inc(&nfsi->ndirty);
> kref_get(&req->wb_kref);
> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
> NFS_PAGE_TAG_LOCKED);
> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
> static void
> nfs_mark_request_dirty(struct nfs_page *req)
> {
> + struct inode *inode = req->wb_context->path.dentry->d_inode;
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + atomic_inc(&nfsi->ndirty);
> __set_page_dirty_nobuffers(req->wb_page);
> }
>
> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
> req->wb_bytes = end - req->wb_offset;
> else
> req->wb_bytes = rqend - req->wb_offset;
> + atomic_inc(&NFS_I(inode)->ndirty);
> out_unlock:
> spin_unlock(&inode->i_lock);
> return req;
> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
> count,
> (unsigned long long)data->args.offset);
>
> + nfs_inc_outstanding_writes(inode);
> +
> task = rpc_run_task(&task_setup_data);
> if (IS_ERR(task))
> return PTR_ERR(task);
> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
> */
> status = NFS_PROTO(data->inode)->write_done(task, data);
> if (status != 0)
> - return status;
> + goto out;
> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
>
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
> /* Can't do anything about it except throw an error. */
> task->tk_status = -EIO;
> }
> - return 0;
> +out:
> + nfs_dec_outstanding_writes(data->inode);
> + return status;
> }
>
>
> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
> }
>
> +/*
> + * Start the WRITE requests for dirty pages on their way.
> + * This is used when a sufficient number of dirty pages
> + * have accumulated.
> + */
> +int nfs_wb_interim(struct inode *inode)
> +{
> + struct address_space *mapping = inode->i_mapping;
> + struct writeback_control wbc = {
> + .bdi = mapping->backing_dev_info,
> + .sync_mode = WB_SYNC_NONE,
> + .nr_to_write = LONG_MAX,
> + .range_start = 0,
> + .range_end = LLONG_MAX,
> + };
> + int ret;
> +
> + ret = nfs_writepages(mapping, &wbc);
> + if (ret < 0)
> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
> + return ret;
> +}
> +
> int __init nfs_init_writepagecache(void)
> {
> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
> .mode = 0644,
> .proc_handler = &proc_dointvec,
> },
> + {
> + .ctl_name = CTL_UNNUMBERED,
> + .procname = "nfs_max_outstanding_writes",
> + .data = &nfs_max_outstanding_writes,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec,
> + },
> { .ctl_name = 0 }
> };
>
> --- linux-2.6.28.i686/fs/nfs/file.c.org
> +++ linux-2.6.28.i686/fs/nfs/file.c
> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
> return 0;
> }
>
> +static int nfs_is_serial(struct inode *inode, loff_t pos)
> +{
> + return NFS_I(inode)->wrpos == pos;
> +}
> +
> static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec
> *iov,
> unsigned long nr_segs, loff_t pos)
> {
> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
> - struct inode * inode = dentry->d_inode;
> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
> + struct inode *inode = dentry->d_inode;
> + struct nfs_inode *nfsi = NFS_I(inode);
> ssize_t result;
> size_t count = iov_length(iov, nr_segs);
>
> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
> result = -EBUSY;
> if (IS_SWAPFILE(inode))
> goto out_swapfile;
> +
> + result = count;
> + if (!count)
> + goto out;
> +
> + nfs_wait_for_outstanding_writes(inode);
> +
> /*
> * O_APPEND implies that we must revalidate the file length.
> */
> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
> goto out;
> }
>
> - result = count;
> - if (!count)
> - goto out;
> -
> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
> /* Return error values for O_SYNC and IS_SYNC() */
> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
> - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
> inode);
> - if (err < 0)
> - result = err;
> + if (result >= 0) {
> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
> + int err;
> + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
> + inode);
> + if (err < 0)
> + result = err;
> + } else if (nfs_max_outstanding_writes &&
> + nfs_is_serial(inode, pos) &&
> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
> + nfs_wb_interim(inode);
> + if (result > 0)
> + nfsi->wrpos = pos + result;
> }
> out:
> return result;
> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
> @@ -168,6 +168,7 @@ struct nfs_inode {
>
> unsigned long ncommit,
> npages;
> + atomic_t ndirty;
>
> /* Open contexts for shared mmap writes */
> struct list_head open_files;
> @@ -186,6 +187,9 @@ struct nfs_inode {
> fmode_t delegation_state;
> struct rw_semaphore rwsem;
> #endif /* CONFIG_NFS_V4*/
> + atomic_t writes; /* number of outstanding WRITEs */
> + wait_queue_head_t writes_wq;
> + loff_t wrpos; /* position after last WRITE */
> struct inode vfs_inode;
> };
>
> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
> * linux/fs/nfs/write.c
> */
> extern int nfs_congestion_kb;
> +extern int nfs_max_outstanding_writes;
> extern int nfs_writepage(struct page *page, struct
> writeback_control *wbc);
> extern int nfs_writepages(struct address_space *, struct
> writeback_control *);
> extern int nfs_flush_incompatible(struct file *file, struct page
> *page);
> extern int nfs_updatepage(struct file *, struct page *, unsigned
> int, unsigned int);
> -extern int nfs_writeback_done(struct rpc_task *, struct
> nfs_write_data *);
> +extern int nfs_writeback_done(struct rpc_task *, struct
> nfs_write_data *);
> extern void nfs_writedata_release(void *);
> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>
> /*
> * Try to write back everything synchronously (but check the
> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
> extern int nfs_wb_nocommit(struct inode *inode);
> extern int nfs_wb_page(struct inode *inode, struct page* page);
> extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
> +extern int nfs_wb_interim(struct inode *);
> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
> extern int nfs_commit_inode(struct inode *, int);
> extern struct nfs_write_data *nfs_commitdata_alloc(void);

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2009-01-23 21:00:40

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

Chuck Lever wrote:
> How does this play with memory pressure?
>
> I would expect the generic background page flushers, like pdflush and
> kswapd, could still cause out-of-order writes.
>

I believe that they all end up calling into the NFS client
code via the nfs_writepages() interface with the writeback
control struct set to write out pages sequentially, possibly
limiting the number of pages to be written, but usually not.

I ran this code on a 1GB system, writing out 2GB+ files, and
the only out of order issues that I saw were generated by
the pdflush and the application doing the writing itself
intermixing the generation of WRITE requests. This was
addressed by the introduction of NFS_INO_FLUSHING and the
code in nfs_writepages() which uses it.

I also ran the code on a 4GB system with the changes,
writing the same sized files, andno out of order WRITE
requests were seen.

I believe that Nick Piggin solved a bunch of these issues
recently and the NFS_INO_FLUSHING patch along with his
patches seems to have resolved the issues.

Thanx...

ps

> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>
>> Hi.
>>
>> Attached is a patch which implements some flow control for the
>> NFS client to control dirty pages. The flow control is
>> implemented on a per-file basis and causes dirty pages to be
>> written out when the client can detect that the application is
>> writing in a serial fashion and has dirtied enough pages to
>> fill a complete over the wire transfer.
>>
>> This work was precipitated by working on a situation where a
>> server at a customer site was not able to adequately handle
>> the behavior of the Linux NFS client. This particular server
>> required that all data to the file written to the file be
>> written in a strictly serial fashion. It also had problems
>> handling the Linux NFS client semantic of caching a large
>> amount of data and then sending out that data all at once.
>>
>> The sequential ordering problem was resolved by a previous
>> patch which was submitted to the linux-nfs list. This patch
>> addresses the capacity problem.
>>
>> The problem is resolved by sending WRITE requests much
>> earlier in the process of the application writing to the file.
>> The client keeps track of the number of dirty pages associated
>> with the file and also the last offset of the data being
>> written. When the client detects that a full over the wire
>> transfer could be constructed and that the application is
>> writing sequentially, then it generates an UNSTABLE write to
>> server for the currently dirty data.
>>
>> The client also keeps track of the number of these WRITE
>> requests which have been generated. It flow controls based
>> on a configurable maximum. This keeps the client from
>> completely overwhelming the server.
>>
>> A nice side effect of the framework is that the issue of
>> stat()'ing a file being written can be handled much more
>> quickly than before. The amount of data that must be
>> transmitted to the server to satisfy the "latest mtime"
>> requirement is limited. Also, the application writing to
>> the file is blocked until the over the wire GETATTR is
>> completed. This allows the GETATTR to be send and the
>> response received without competing with the data being
>> written.
>>
>> I did not do formal performance testing, but in my
>> watching testing, I did not see any performance regressions.
>>
>> As a side note -- the more natural model of flow control
>> would seem to be at the client/server level instead of
>> the per-file level. However, that level was too coarse
>> with the particular server that was required to be used
>> because its requirements were at the per-file level.
>>
>> Thanx...
>>
>> ps
>>
>> Signed-off-by: Peter Staubach <[email protected]>
>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>> kstat *stat)
>> {
>> struct inode *inode = dentry->d_inode;
>> - int need_atime = NFS_I(inode)->cache_validity &
>> NFS_INO_INVALID_ATIME;
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
>> int err;
>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>
>> /*
>> * Flush out writes to the server in order to update c/mtime.
>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>> * nfs_wb_nocommit.
>> */
>> if (S_ISREG(inode->i_mode)) {
>> - mutex_lock(&inode->i_mutex);
>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>> nfs_wb_nocommit(inode);
>> - mutex_unlock(&inode->i_mutex);
>> }
>>
>> /*
>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>> generic_fillattr(inode, stat);
>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>> }
>> + if (S_ISREG(inode->i_mode)) {
>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>> + wake_up(&nfsi->writes_wq);
>> + }
>> return err;
>> }
>>
>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>> nfsi->ncommit = 0;
>> nfsi->npages = 0;
>> + atomic_set(&nfsi->ndirty, 0);
>> atomic_set(&nfsi->silly_count, 1);
>> INIT_HLIST_HEAD(&nfsi->silly_list);
>> init_waitqueue_head(&nfsi->waitqueue);
>> + atomic_set(&nfsi->writes, 0);
>> + init_waitqueue_head(&nfsi->writes_wq);
>> + nfsi->wrpos = 0;
>> nfs4_init_once(nfsi);
>> }
>>
>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>> +++ linux-2.6.28.i686/fs/nfs/write.c
>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>> if (!ret) {
>> struct inode *inode = page->mapping->host;
>> struct nfs_server *nfss = NFS_SERVER(inode);
>> + struct nfs_inode *nfsi = NFS_I(inode);
>>
>> + atomic_dec(&nfsi->ndirty);
>> if (atomic_long_inc_return(&nfss->writeback) >
>> NFS_CONGESTION_ON_THRESH)
>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>> return ret;
>> }
>>
>> +int nfs_max_outstanding_writes = 16;
>> +
>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>> +{
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> +
>> + atomic_inc(&nfsi->writes);
>> +}
>> +
>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>> +{
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> +
>> + if (atomic_dec_return(&nfsi->writes) < nfs_max_outstanding_writes)
>> + wake_up(&nfsi->writes_wq);
>> +}
>> +
>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>> +{
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> +
>> + if (nfs_max_outstanding_writes) {
>> + wait_event(nfsi->writes_wq,
>> + atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
>> + }
>> +}
>> +
>> int nfs_writepages(struct address_space *mapping, struct
>> writeback_control *wbc)
>> {
>> struct inode *inode = mapping->host;
>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>> SetPagePrivate(req->wb_page);
>> set_page_private(req->wb_page, (unsigned long)req);
>> nfsi->npages++;
>> + atomic_inc(&nfsi->ndirty);
>> kref_get(&req->wb_kref);
>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>> NFS_PAGE_TAG_LOCKED);
>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>> static void
>> nfs_mark_request_dirty(struct nfs_page *req)
>> {
>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> +
>> + atomic_inc(&nfsi->ndirty);
>> __set_page_dirty_nobuffers(req->wb_page);
>> }
>>
>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>> req->wb_bytes = end - req->wb_offset;
>> else
>> req->wb_bytes = rqend - req->wb_offset;
>> + atomic_inc(&NFS_I(inode)->ndirty);
>> out_unlock:
>> spin_unlock(&inode->i_lock);
>> return req;
>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>> count,
>> (unsigned long long)data->args.offset);
>>
>> + nfs_inc_outstanding_writes(inode);
>> +
>> task = rpc_run_task(&task_setup_data);
>> if (IS_ERR(task))
>> return PTR_ERR(task);
>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>> */
>> status = NFS_PROTO(data->inode)->write_done(task, data);
>> if (status != 0)
>> - return status;
>> + goto out;
>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
>>
>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>> /* Can't do anything about it except throw an error. */
>> task->tk_status = -EIO;
>> }
>> - return 0;
>> +out:
>> + nfs_dec_outstanding_writes(data->inode);
>> + return status;
>> }
>>
>>
>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>> }
>>
>> +/*
>> + * Start the WRITE requests for dirty pages on their way.
>> + * This is used when a sufficient number of dirty pages
>> + * have accumulated.
>> + */
>> +int nfs_wb_interim(struct inode *inode)
>> +{
>> + struct address_space *mapping = inode->i_mapping;
>> + struct writeback_control wbc = {
>> + .bdi = mapping->backing_dev_info,
>> + .sync_mode = WB_SYNC_NONE,
>> + .nr_to_write = LONG_MAX,
>> + .range_start = 0,
>> + .range_end = LLONG_MAX,
>> + };
>> + int ret;
>> +
>> + ret = nfs_writepages(mapping, &wbc);
>> + if (ret < 0)
>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>> + return ret;
>> +}
>> +
>> int __init nfs_init_writepagecache(void)
>> {
>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>> .mode = 0644,
>> .proc_handler = &proc_dointvec,
>> },
>> + {
>> + .ctl_name = CTL_UNNUMBERED,
>> + .procname = "nfs_max_outstanding_writes",
>> + .data = &nfs_max_outstanding_writes,
>> + .maxlen = sizeof(int),
>> + .mode = 0644,
>> + .proc_handler = &proc_dointvec,
>> + },
>> { .ctl_name = 0 }
>> };
>>
>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>> +++ linux-2.6.28.i686/fs/nfs/file.c
>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>> return 0;
>> }
>>
>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>> +{
>> + return NFS_I(inode)->wrpos == pos;
>> +}
>> +
>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec
>> *iov,
>> unsigned long nr_segs, loff_t pos)
>> {
>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>> - struct inode * inode = dentry->d_inode;
>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>> + struct inode *inode = dentry->d_inode;
>> + struct nfs_inode *nfsi = NFS_I(inode);
>> ssize_t result;
>> size_t count = iov_length(iov, nr_segs);
>>
>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>> result = -EBUSY;
>> if (IS_SWAPFILE(inode))
>> goto out_swapfile;
>> +
>> + result = count;
>> + if (!count)
>> + goto out;
>> +
>> + nfs_wait_for_outstanding_writes(inode);
>> +
>> /*
>> * O_APPEND implies that we must revalidate the file length.
>> */
>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>> goto out;
>> }
>>
>> - result = count;
>> - if (!count)
>> - goto out;
>> -
>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>> /* Return error values for O_SYNC and IS_SYNC() */
>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
>> - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
>> inode);
>> - if (err < 0)
>> - result = err;
>> + if (result >= 0) {
>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>> + int err;
>> + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
>> + inode);
>> + if (err < 0)
>> + result = err;
>> + } else if (nfs_max_outstanding_writes &&
>> + nfs_is_serial(inode, pos) &&
>> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
>> + nfs_wb_interim(inode);
>> + if (result > 0)
>> + nfsi->wrpos = pos + result;
>> }
>> out:
>> return result;
>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>
>> unsigned long ncommit,
>> npages;
>> + atomic_t ndirty;
>>
>> /* Open contexts for shared mmap writes */
>> struct list_head open_files;
>> @@ -186,6 +187,9 @@ struct nfs_inode {
>> fmode_t delegation_state;
>> struct rw_semaphore rwsem;
>> #endif /* CONFIG_NFS_V4*/
>> + atomic_t writes; /* number of outstanding WRITEs */
>> + wait_queue_head_t writes_wq;
>> + loff_t wrpos; /* position after last WRITE */
>> struct inode vfs_inode;
>> };
>>
>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>> * linux/fs/nfs/write.c
>> */
>> extern int nfs_congestion_kb;
>> +extern int nfs_max_outstanding_writes;
>> extern int nfs_writepage(struct page *page, struct writeback_control
>> *wbc);
>> extern int nfs_writepages(struct address_space *, struct
>> writeback_control *);
>> extern int nfs_flush_incompatible(struct file *file, struct page
>> *page);
>> extern int nfs_updatepage(struct file *, struct page *, unsigned
>> int, unsigned int);
>> -extern int nfs_writeback_done(struct rpc_task *, struct
>> nfs_write_data *);
>> +extern int nfs_writeback_done(struct rpc_task *, struct
>> nfs_write_data *);
>> extern void nfs_writedata_release(void *);
>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>
>> /*
>> * Try to write back everything synchronously (but check the
>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>> extern int nfs_wb_nocommit(struct inode *inode);
>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>> extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
>> +extern int nfs_wb_interim(struct inode *);
>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>> extern int nfs_commit_inode(struct inode *, int);
>> extern struct nfs_write_data *nfs_commitdata_alloc(void);
>
> --Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
>
>
>
>


2009-01-23 21:14:27

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

On Jan 23, 2009, at Jan 23, 2009, 4:00 PM, Peter Staubach wrote:
> Chuck Lever wrote:
>> How does this play with memory pressure?
>>
>> I would expect the generic background page flushers, like pdflush
>> and kswapd, could still cause out-of-order writes.
>>
>
> I believe that they all end up calling into the NFS client
> code via the nfs_writepages() interface with the writeback
> control struct set to write out pages sequentially, possibly
> limiting the number of pages to be written, but usually not.
>
> I ran this code on a 1GB system, writing out 2GB+ files, and
> the only out of order issues that I saw were generated by
> the pdflush and the application doing the writing itself
> intermixing the generation of WRITE requests. This was
> addressed by the introduction of NFS_INO_FLUSHING and the
> code in nfs_writepages() which uses it.
>
> I also ran the code on a 4GB system with the changes,
> writing the same sized files, andno out of order WRITE
> requests were seen.
>
> I believe that Nick Piggin solved a bunch of these issues
> recently and the NFS_INO_FLUSHING patch along with his
> patches seems to have resolved the issues.

That's good news.

How about ^C -- any weirdness there? pages left pinned on the client,
fractured writes on the server, application hangs?

Have you tried this patch with a multi-threaded write-intensive async
I/O workload? I'm sure you know which one I'm referring to. ;-)

Would it make sense to key the new nfs_max_outstanding_writes sysctl
to the RPC slot table size for that mount point? Another tunable...
that doesn't self-tune... Ugh. How did you arrive at the default
value of 16?

CITI had been trying to improve write performance on transatlantic
10Gb links. Would be useful to test in that environment.

> Thanx...
>
> ps
>
>> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>>
>>> Hi.
>>>
>>> Attached is a patch which implements some flow control for the
>>> NFS client to control dirty pages. The flow control is
>>> implemented on a per-file basis and causes dirty pages to be
>>> written out when the client can detect that the application is
>>> writing in a serial fashion and has dirtied enough pages to
>>> fill a complete over the wire transfer.
>>>
>>> This work was precipitated by working on a situation where a
>>> server at a customer site was not able to adequately handle
>>> the behavior of the Linux NFS client. This particular server
>>> required that all data to the file written to the file be
>>> written in a strictly serial fashion. It also had problems
>>> handling the Linux NFS client semantic of caching a large
>>> amount of data and then sending out that data all at once.
>>>
>>> The sequential ordering problem was resolved by a previous
>>> patch which was submitted to the linux-nfs list. This patch
>>> addresses the capacity problem.
>>>
>>> The problem is resolved by sending WRITE requests much
>>> earlier in the process of the application writing to the file.
>>> The client keeps track of the number of dirty pages associated
>>> with the file and also the last offset of the data being
>>> written. When the client detects that a full over the wire
>>> transfer could be constructed and that the application is
>>> writing sequentially, then it generates an UNSTABLE write to
>>> server for the currently dirty data.
>>>
>>> The client also keeps track of the number of these WRITE
>>> requests which have been generated. It flow controls based
>>> on a configurable maximum. This keeps the client from
>>> completely overwhelming the server.
>>>
>>> A nice side effect of the framework is that the issue of
>>> stat()'ing a file being written can be handled much more
>>> quickly than before. The amount of data that must be
>>> transmitted to the server to satisfy the "latest mtime"
>>> requirement is limited. Also, the application writing to
>>> the file is blocked until the over the wire GETATTR is
>>> completed. This allows the GETATTR to be send and the
>>> response received without competing with the data being
>>> written.
>>>
>>> I did not do formal performance testing, but in my
>>> watching testing, I did not see any performance regressions.
>>>
>>> As a side note -- the more natural model of flow control
>>> would seem to be at the client/server level instead of
>>> the per-file level. However, that level was too coarse
>>> with the particular server that was required to be used
>>> because its requirements were at the per-file level.
>>>
>>> Thanx...
>>>
>>> ps
>>>
>>> Signed-off-by: Peter Staubach <[email protected]>
>>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
>>> struct kstat *stat)
>>> {
>>> struct inode *inode = dentry->d_inode;
>>> - int need_atime = NFS_I(inode)->cache_validity &
>>> NFS_INO_INVALID_ATIME;
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
>>> int err;
>>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>>
>>> /*
>>> * Flush out writes to the server in order to update c/mtime.
>>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>>> * nfs_wb_nocommit.
>>> */
>>> if (S_ISREG(inode->i_mode)) {
>>> - mutex_lock(&inode->i_mutex);
>>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>>> nfs_wb_nocommit(inode);
>>> - mutex_unlock(&inode->i_mutex);
>>> }
>>>
>>> /*
>>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>>> generic_fillattr(inode, stat);
>>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>>> }
>>> + if (S_ISREG(inode->i_mode)) {
>>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>>> + wake_up(&nfsi->writes_wq);
>>> + }
>>> return err;
>>> }
>>>
>>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>>> nfsi->ncommit = 0;
>>> nfsi->npages = 0;
>>> + atomic_set(&nfsi->ndirty, 0);
>>> atomic_set(&nfsi->silly_count, 1);
>>> INIT_HLIST_HEAD(&nfsi->silly_list);
>>> init_waitqueue_head(&nfsi->waitqueue);
>>> + atomic_set(&nfsi->writes, 0);
>>> + init_waitqueue_head(&nfsi->writes_wq);
>>> + nfsi->wrpos = 0;
>>> nfs4_init_once(nfsi);
>>> }
>>>
>>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>>> +++ linux-2.6.28.i686/fs/nfs/write.c
>>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>>> if (!ret) {
>>> struct inode *inode = page->mapping->host;
>>> struct nfs_server *nfss = NFS_SERVER(inode);
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>
>>> + atomic_dec(&nfsi->ndirty);
>>> if (atomic_long_inc_return(&nfss->writeback) >
>>> NFS_CONGESTION_ON_THRESH)
>>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>>> return ret;
>>> }
>>>
>>> +int nfs_max_outstanding_writes = 16;
>>> +
>>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>>> +{
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> +
>>> + atomic_inc(&nfsi->writes);
>>> +}
>>> +
>>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>>> +{
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> +
>>> + if (atomic_dec_return(&nfsi->writes) <
>>> nfs_max_outstanding_writes)
>>> + wake_up(&nfsi->writes_wq);
>>> +}
>>> +
>>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>>> +{
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> +
>>> + if (nfs_max_outstanding_writes) {
>>> + wait_event(nfsi->writes_wq,
>>> + atomic_read(&nfsi->writes) <
>>> nfs_max_outstanding_writes);
>>> + }
>>> +}
>>> +
>>> int nfs_writepages(struct address_space *mapping, struct
>>> writeback_control *wbc)
>>> {
>>> struct inode *inode = mapping->host;
>>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>>> SetPagePrivate(req->wb_page);
>>> set_page_private(req->wb_page, (unsigned long)req);
>>> nfsi->npages++;
>>> + atomic_inc(&nfsi->ndirty);
>>> kref_get(&req->wb_kref);
>>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>>> NFS_PAGE_TAG_LOCKED);
>>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>>> static void
>>> nfs_mark_request_dirty(struct nfs_page *req)
>>> {
>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> +
>>> + atomic_inc(&nfsi->ndirty);
>>> __set_page_dirty_nobuffers(req->wb_page);
>>> }
>>>
>>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>>> req->wb_bytes = end - req->wb_offset;
>>> else
>>> req->wb_bytes = rqend - req->wb_offset;
>>> + atomic_inc(&NFS_I(inode)->ndirty);
>>> out_unlock:
>>> spin_unlock(&inode->i_lock);
>>> return req;
>>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>>> count,
>>> (unsigned long long)data->args.offset);
>>>
>>> + nfs_inc_outstanding_writes(inode);
>>> +
>>> task = rpc_run_task(&task_setup_data);
>>> if (IS_ERR(task))
>>> return PTR_ERR(task);
>>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>>> */
>>> status = NFS_PROTO(data->inode)->write_done(task, data);
>>> if (status != 0)
>>> - return status;
>>> + goto out;
>>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp-
>>> >count);
>>>
>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>>> /* Can't do anything about it except throw an error. */
>>> task->tk_status = -EIO;
>>> }
>>> - return 0;
>>> +out:
>>> + nfs_dec_outstanding_writes(data->inode);
>>> + return status;
>>> }
>>>
>>>
>>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>>> }
>>>
>>> +/*
>>> + * Start the WRITE requests for dirty pages on their way.
>>> + * This is used when a sufficient number of dirty pages
>>> + * have accumulated.
>>> + */
>>> +int nfs_wb_interim(struct inode *inode)
>>> +{
>>> + struct address_space *mapping = inode->i_mapping;
>>> + struct writeback_control wbc = {
>>> + .bdi = mapping->backing_dev_info,
>>> + .sync_mode = WB_SYNC_NONE,
>>> + .nr_to_write = LONG_MAX,
>>> + .range_start = 0,
>>> + .range_end = LLONG_MAX,
>>> + };
>>> + int ret;
>>> +
>>> + ret = nfs_writepages(mapping, &wbc);
>>> + if (ret < 0)
>>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>>> + return ret;
>>> +}
>>> +
>>> int __init nfs_init_writepagecache(void)
>>> {
>>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>>> .mode = 0644,
>>> .proc_handler = &proc_dointvec,
>>> },
>>> + {
>>> + .ctl_name = CTL_UNNUMBERED,
>>> + .procname = "nfs_max_outstanding_writes",
>>> + .data = &nfs_max_outstanding_writes,
>>> + .maxlen = sizeof(int),
>>> + .mode = 0644,
>>> + .proc_handler = &proc_dointvec,
>>> + },
>>> { .ctl_name = 0 }
>>> };
>>>
>>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>>> +++ linux-2.6.28.i686/fs/nfs/file.c
>>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>>> return 0;
>>> }
>>>
>>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>>> +{
>>> + return NFS_I(inode)->wrpos == pos;
>>> +}
>>> +
>>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct
>>> iovec *iov,
>>> unsigned long nr_segs, loff_t pos)
>>> {
>>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>>> - struct inode * inode = dentry->d_inode;
>>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>>> + struct inode *inode = dentry->d_inode;
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> ssize_t result;
>>> size_t count = iov_length(iov, nr_segs);
>>>
>>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>>> result = -EBUSY;
>>> if (IS_SWAPFILE(inode))
>>> goto out_swapfile;
>>> +
>>> + result = count;
>>> + if (!count)
>>> + goto out;
>>> +
>>> + nfs_wait_for_outstanding_writes(inode);
>>> +
>>> /*
>>> * O_APPEND implies that we must revalidate the file length.
>>> */
>>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>>> goto out;
>>> }
>>>
>>> - result = count;
>>> - if (!count)
>>> - goto out;
>>> -
>>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>>> /* Return error values for O_SYNC and IS_SYNC() */
>>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
>>> - int err = nfs_do_fsync(nfs_file_open_context(iocb-
>>> >ki_filp), inode);
>>> - if (err < 0)
>>> - result = err;
>>> + if (result >= 0) {
>>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>>> + int err;
>>> + err = nfs_do_fsync(nfs_file_open_context(iocb-
>>> >ki_filp),
>>> + inode);
>>> + if (err < 0)
>>> + result = err;
>>> + } else if (nfs_max_outstanding_writes &&
>>> + nfs_is_serial(inode, pos) &&
>>> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)-
>>> >wpages)
>>> + nfs_wb_interim(inode);
>>> + if (result > 0)
>>> + nfsi->wrpos = pos + result;
>>> }
>>> out:
>>> return result;
>>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>>
>>> unsigned long ncommit,
>>> npages;
>>> + atomic_t ndirty;
>>>
>>> /* Open contexts for shared mmap writes */
>>> struct list_head open_files;
>>> @@ -186,6 +187,9 @@ struct nfs_inode {
>>> fmode_t delegation_state;
>>> struct rw_semaphore rwsem;
>>> #endif /* CONFIG_NFS_V4*/
>>> + atomic_t writes; /* number of outstanding WRITEs */
>>> + wait_queue_head_t writes_wq;
>>> + loff_t wrpos; /* position after last WRITE */
>>> struct inode vfs_inode;
>>> };
>>>
>>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>>> * linux/fs/nfs/write.c
>>> */
>>> extern int nfs_congestion_kb;
>>> +extern int nfs_max_outstanding_writes;
>>> extern int nfs_writepage(struct page *page, struct
>>> writeback_control *wbc);
>>> extern int nfs_writepages(struct address_space *, struct
>>> writeback_control *);
>>> extern int nfs_flush_incompatible(struct file *file, struct page
>>> *page);
>>> extern int nfs_updatepage(struct file *, struct page *, unsigned
>>> int, unsigned int);
>>> -extern int nfs_writeback_done(struct rpc_task *, struct
>>> nfs_write_data *);
>>> +extern int nfs_writeback_done(struct rpc_task *, struct
>>> nfs_write_data *);
>>> extern void nfs_writedata_release(void *);
>>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>>
>>> /*
>>> * Try to write back everything synchronously (but check the
>>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>>> extern int nfs_wb_nocommit(struct inode *inode);
>>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>>> extern int nfs_wb_page_cancel(struct inode *inode, struct page*
>>> page);
>>> +extern int nfs_wb_interim(struct inode *);
>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>> extern int nfs_commit_inode(struct inode *, int);
>>> extern struct nfs_write_data *nfs_commitdata_alloc(void);

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

2009-01-23 21:24:07

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

Chuck Lever wrote:
> On Jan 23, 2009, at Jan 23, 2009, 4:00 PM, Peter Staubach wrote:
>> Chuck Lever wrote:
>>> How does this play with memory pressure?
>>>
>>> I would expect the generic background page flushers, like pdflush
>>> and kswapd, could still cause out-of-order writes.
>>>
>>
>> I believe that they all end up calling into the NFS client
>> code via the nfs_writepages() interface with the writeback
>> control struct set to write out pages sequentially, possibly
>> limiting the number of pages to be written, but usually not.
>>
>> I ran this code on a 1GB system, writing out 2GB+ files, and
>> the only out of order issues that I saw were generated by
>> the pdflush and the application doing the writing itself
>> intermixing the generation of WRITE requests. This was
>> addressed by the introduction of NFS_INO_FLUSHING and the
>> code in nfs_writepages() which uses it.
>>
>> I also ran the code on a 4GB system with the changes,
>> writing the same sized files, andno out of order WRITE
>> requests were seen.
>>
>> I believe that Nick Piggin solved a bunch of these issues
>> recently and the NFS_INO_FLUSHING patch along with his
>> patches seems to have resolved the issues.
>
> That's good news.
>
> How about ^C -- any weirdness there? pages left pinned on the client,
> fractured writes on the server, application hangs?
>

Nope, nothing like that. Signals should cause things to unwind
normally.

> Have you tried this patch with a multi-threaded write-intensive async
> I/O workload? I'm sure you know which one I'm referring to. ;-)
>

No, probably not. :-)

> Would it make sense to key the new nfs_max_outstanding_writes sysctl
> to the RPC slot table size for that mount point? Another tunable...
> that doesn't self-tune... Ugh. How did you arrive at the default
> value of 16?
>

Yes, I didn't like the new tunable either. I needed to be able
to adjust it though. For my situation, I had a 48GB memory
client, but the right value of the tunable seemed to be, 2.
Sigh.


> CITI had been trying to improve write performance on transatlantic
> 10Gb links. Would be useful to test in that environment.
>

That, I don't know.

ps


>> Thanx...
>>
>> ps
>>
>>> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>>>
>>>> Hi.
>>>>
>>>> Attached is a patch which implements some flow control for the
>>>> NFS client to control dirty pages. The flow control is
>>>> implemented on a per-file basis and causes dirty pages to be
>>>> written out when the client can detect that the application is
>>>> writing in a serial fashion and has dirtied enough pages to
>>>> fill a complete over the wire transfer.
>>>>
>>>> This work was precipitated by working on a situation where a
>>>> server at a customer site was not able to adequately handle
>>>> the behavior of the Linux NFS client. This particular server
>>>> required that all data to the file written to the file be
>>>> written in a strictly serial fashion. It also had problems
>>>> handling the Linux NFS client semantic of caching a large
>>>> amount of data and then sending out that data all at once.
>>>>
>>>> The sequential ordering problem was resolved by a previous
>>>> patch which was submitted to the linux-nfs list. This patch
>>>> addresses the capacity problem.
>>>>
>>>> The problem is resolved by sending WRITE requests much
>>>> earlier in the process of the application writing to the file.
>>>> The client keeps track of the number of dirty pages associated
>>>> with the file and also the last offset of the data being
>>>> written. When the client detects that a full over the wire
>>>> transfer could be constructed and that the application is
>>>> writing sequentially, then it generates an UNSTABLE write to
>>>> server for the currently dirty data.
>>>>
>>>> The client also keeps track of the number of these WRITE
>>>> requests which have been generated. It flow controls based
>>>> on a configurable maximum. This keeps the client from
>>>> completely overwhelming the server.
>>>>
>>>> A nice side effect of the framework is that the issue of
>>>> stat()'ing a file being written can be handled much more
>>>> quickly than before. The amount of data that must be
>>>> transmitted to the server to satisfy the "latest mtime"
>>>> requirement is limited. Also, the application writing to
>>>> the file is blocked until the over the wire GETATTR is
>>>> completed. This allows the GETATTR to be send and the
>>>> response received without competing with the data being
>>>> written.
>>>>
>>>> I did not do formal performance testing, but in my
>>>> watching testing, I did not see any performance regressions.
>>>>
>>>> As a side note -- the more natural model of flow control
>>>> would seem to be at the client/server level instead of
>>>> the per-file level. However, that level was too coarse
>>>> with the particular server that was required to be used
>>>> because its requirements were at the per-file level.
>>>>
>>>> Thanx...
>>>>
>>>> ps
>>>>
>>>> Signed-off-by: Peter Staubach <[email protected]>
>>>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>>>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>>>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>>>> kstat *stat)
>>>> {
>>>> struct inode *inode = dentry->d_inode;
>>>> - int need_atime = NFS_I(inode)->cache_validity &
>>>> NFS_INO_INVALID_ATIME;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
>>>> int err;
>>>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>>>
>>>> /*
>>>> * Flush out writes to the server in order to update c/mtime.
>>>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>> * nfs_wb_nocommit.
>>>> */
>>>> if (S_ISREG(inode->i_mode)) {
>>>> - mutex_lock(&inode->i_mutex);
>>>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>>>> nfs_wb_nocommit(inode);
>>>> - mutex_unlock(&inode->i_mutex);
>>>> }
>>>>
>>>> /*
>>>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>> generic_fillattr(inode, stat);
>>>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>>>> }
>>>> + if (S_ISREG(inode->i_mode)) {
>>>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>>>> + wake_up(&nfsi->writes_wq);
>>>> + }
>>>> return err;
>>>> }
>>>>
>>>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>>>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>>>> nfsi->ncommit = 0;
>>>> nfsi->npages = 0;
>>>> + atomic_set(&nfsi->ndirty, 0);
>>>> atomic_set(&nfsi->silly_count, 1);
>>>> INIT_HLIST_HEAD(&nfsi->silly_list);
>>>> init_waitqueue_head(&nfsi->waitqueue);
>>>> + atomic_set(&nfsi->writes, 0);
>>>> + init_waitqueue_head(&nfsi->writes_wq);
>>>> + nfsi->wrpos = 0;
>>>> nfs4_init_once(nfsi);
>>>> }
>>>>
>>>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/write.c
>>>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>>>> if (!ret) {
>>>> struct inode *inode = page->mapping->host;
>>>> struct nfs_server *nfss = NFS_SERVER(inode);
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>
>>>> + atomic_dec(&nfsi->ndirty);
>>>> if (atomic_long_inc_return(&nfss->writeback) >
>>>> NFS_CONGESTION_ON_THRESH)
>>>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>>>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>>>> return ret;
>>>> }
>>>>
>>>> +int nfs_max_outstanding_writes = 16;
>>>> +
>>>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + atomic_inc(&nfsi->writes);
>>>> +}
>>>> +
>>>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + if (atomic_dec_return(&nfsi->writes) <
>>>> nfs_max_outstanding_writes)
>>>> + wake_up(&nfsi->writes_wq);
>>>> +}
>>>> +
>>>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + if (nfs_max_outstanding_writes) {
>>>> + wait_event(nfsi->writes_wq,
>>>> + atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
>>>> + }
>>>> +}
>>>> +
>>>> int nfs_writepages(struct address_space *mapping, struct
>>>> writeback_control *wbc)
>>>> {
>>>> struct inode *inode = mapping->host;
>>>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>>>> SetPagePrivate(req->wb_page);
>>>> set_page_private(req->wb_page, (unsigned long)req);
>>>> nfsi->npages++;
>>>> + atomic_inc(&nfsi->ndirty);
>>>> kref_get(&req->wb_kref);
>>>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>>>> NFS_PAGE_TAG_LOCKED);
>>>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>>>> static void
>>>> nfs_mark_request_dirty(struct nfs_page *req)
>>>> {
>>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + atomic_inc(&nfsi->ndirty);
>>>> __set_page_dirty_nobuffers(req->wb_page);
>>>> }
>>>>
>>>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>>>> req->wb_bytes = end - req->wb_offset;
>>>> else
>>>> req->wb_bytes = rqend - req->wb_offset;
>>>> + atomic_inc(&NFS_I(inode)->ndirty);
>>>> out_unlock:
>>>> spin_unlock(&inode->i_lock);
>>>> return req;
>>>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>>>> count,
>>>> (unsigned long long)data->args.offset);
>>>>
>>>> + nfs_inc_outstanding_writes(inode);
>>>> +
>>>> task = rpc_run_task(&task_setup_data);
>>>> if (IS_ERR(task))
>>>> return PTR_ERR(task);
>>>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>>>> */
>>>> status = NFS_PROTO(data->inode)->write_done(task, data);
>>>> if (status != 0)
>>>> - return status;
>>>> + goto out;
>>>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
>>>>
>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>>>> /* Can't do anything about it except throw an error. */
>>>> task->tk_status = -EIO;
>>>> }
>>>> - return 0;
>>>> +out:
>>>> + nfs_dec_outstanding_writes(data->inode);
>>>> + return status;
>>>> }
>>>>
>>>>
>>>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>>>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>>>> }
>>>>
>>>> +/*
>>>> + * Start the WRITE requests for dirty pages on their way.
>>>> + * This is used when a sufficient number of dirty pages
>>>> + * have accumulated.
>>>> + */
>>>> +int nfs_wb_interim(struct inode *inode)
>>>> +{
>>>> + struct address_space *mapping = inode->i_mapping;
>>>> + struct writeback_control wbc = {
>>>> + .bdi = mapping->backing_dev_info,
>>>> + .sync_mode = WB_SYNC_NONE,
>>>> + .nr_to_write = LONG_MAX,
>>>> + .range_start = 0,
>>>> + .range_end = LLONG_MAX,
>>>> + };
>>>> + int ret;
>>>> +
>>>> + ret = nfs_writepages(mapping, &wbc);
>>>> + if (ret < 0)
>>>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>>>> + return ret;
>>>> +}
>>>> +
>>>> int __init nfs_init_writepagecache(void)
>>>> {
>>>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>>>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>>>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>>>> .mode = 0644,
>>>> .proc_handler = &proc_dointvec,
>>>> },
>>>> + {
>>>> + .ctl_name = CTL_UNNUMBERED,
>>>> + .procname = "nfs_max_outstanding_writes",
>>>> + .data = &nfs_max_outstanding_writes,
>>>> + .maxlen = sizeof(int),
>>>> + .mode = 0644,
>>>> + .proc_handler = &proc_dointvec,
>>>> + },
>>>> { .ctl_name = 0 }
>>>> };
>>>>
>>>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/file.c
>>>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>>>> return 0;
>>>> }
>>>>
>>>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>>>> +{
>>>> + return NFS_I(inode)->wrpos == pos;
>>>> +}
>>>> +
>>>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct
>>>> iovec *iov,
>>>> unsigned long nr_segs, loff_t pos)
>>>> {
>>>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>>>> - struct inode * inode = dentry->d_inode;
>>>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>>>> + struct inode *inode = dentry->d_inode;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> ssize_t result;
>>>> size_t count = iov_length(iov, nr_segs);
>>>>
>>>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>>>> result = -EBUSY;
>>>> if (IS_SWAPFILE(inode))
>>>> goto out_swapfile;
>>>> +
>>>> + result = count;
>>>> + if (!count)
>>>> + goto out;
>>>> +
>>>> + nfs_wait_for_outstanding_writes(inode);
>>>> +
>>>> /*
>>>> * O_APPEND implies that we must revalidate the file length.
>>>> */
>>>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>>>> goto out;
>>>> }
>>>>
>>>> - result = count;
>>>> - if (!count)
>>>> - goto out;
>>>> -
>>>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>>>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>>>> /* Return error values for O_SYNC and IS_SYNC() */
>>>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>> - int err =
>>>> nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
>>>> - if (err < 0)
>>>> - result = err;
>>>> + if (result >= 0) {
>>>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>> + int err;
>>>> + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
>>>> + inode);
>>>> + if (err < 0)
>>>> + result = err;
>>>> + } else if (nfs_max_outstanding_writes &&
>>>> + nfs_is_serial(inode, pos) &&
>>>> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
>>>> + nfs_wb_interim(inode);
>>>> + if (result > 0)
>>>> + nfsi->wrpos = pos + result;
>>>> }
>>>> out:
>>>> return result;
>>>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>>>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>>>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>>>
>>>> unsigned long ncommit,
>>>> npages;
>>>> + atomic_t ndirty;
>>>>
>>>> /* Open contexts for shared mmap writes */
>>>> struct list_head open_files;
>>>> @@ -186,6 +187,9 @@ struct nfs_inode {
>>>> fmode_t delegation_state;
>>>> struct rw_semaphore rwsem;
>>>> #endif /* CONFIG_NFS_V4*/
>>>> + atomic_t writes; /* number of outstanding WRITEs */
>>>> + wait_queue_head_t writes_wq;
>>>> + loff_t wrpos; /* position after last WRITE */
>>>> struct inode vfs_inode;
>>>> };
>>>>
>>>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>>>> * linux/fs/nfs/write.c
>>>> */
>>>> extern int nfs_congestion_kb;
>>>> +extern int nfs_max_outstanding_writes;
>>>> extern int nfs_writepage(struct page *page, struct
>>>> writeback_control *wbc);
>>>> extern int nfs_writepages(struct address_space *, struct
>>>> writeback_control *);
>>>> extern int nfs_flush_incompatible(struct file *file, struct page
>>>> *page);
>>>> extern int nfs_updatepage(struct file *, struct page *, unsigned
>>>> int, unsigned int);
>>>> -extern int nfs_writeback_done(struct rpc_task *, struct
>>>> nfs_write_data *);
>>>> +extern int nfs_writeback_done(struct rpc_task *, struct
>>>> nfs_write_data *);
>>>> extern void nfs_writedata_release(void *);
>>>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>>>
>>>> /*
>>>> * Try to write back everything synchronously (but check the
>>>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>>>> extern int nfs_wb_nocommit(struct inode *inode);
>>>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>>>> extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
>>>> +extern int nfs_wb_interim(struct inode *);
>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>> extern int nfs_commit_inode(struct inode *, int);
>>>> extern struct nfs_write_data *nfs_commitdata_alloc(void);
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com


2009-01-23 21:29:26

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

Chuck Lever wrote:
> On Jan 23, 2009, at Jan 23, 2009, 4:00 PM, Peter Staubach wrote:
>> Chuck Lever wrote:
>>> How does this play with memory pressure?
>>>
>>> I would expect the generic background page flushers, like pdflush
>>> and kswapd, could still cause out-of-order writes.
>>>
>>
>> I believe that they all end up calling into the NFS client
>> code via the nfs_writepages() interface with the writeback
>> control struct set to write out pages sequentially, possibly
>> limiting the number of pages to be written, but usually not.
>>
>> I ran this code on a 1GB system, writing out 2GB+ files, and
>> the only out of order issues that I saw were generated by
>> the pdflush and the application doing the writing itself
>> intermixing the generation of WRITE requests. This was
>> addressed by the introduction of NFS_INO_FLUSHING and the
>> code in nfs_writepages() which uses it.
>>
>> I also ran the code on a 4GB system with the changes,
>> writing the same sized files, andno out of order WRITE
>> requests were seen.
>>
>> I believe that Nick Piggin solved a bunch of these issues
>> recently and the NFS_INO_FLUSHING patch along with his
>> patches seems to have resolved the issues.
>
> That's good news.
>
> How about ^C -- any weirdness there? pages left pinned on the client,
> fractured writes on the server, application hangs?
>
> Have you tried this patch with a multi-threaded write-intensive async
> I/O workload? I'm sure you know which one I'm referring to. ;-)
>
> Would it make sense to key the new nfs_max_outstanding_writes sysctl
> to the RPC slot table size for that mount point? Another tunable...
> that doesn't self-tune... Ugh. How did you arrive at the default
> value of 16?
>

Sorry, forgot to answer your last question here. Mostly,
I just picked it out of the air? Other systems default to
8, but that seemed a little small, so I figured that 16, 32K
WRITE requests outstanding should be a reasonable amount of
outstanding data.

Not scientific at all. I am open to suggestions for
autotuning, etc.

Thanx...

ps

> CITI had been trying to improve write performance on transatlantic
> 10Gb links. Would be useful to test in that environment.
>
>> Thanx...
>>
>> ps
>>
>>> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>>>
>>>> Hi.
>>>>
>>>> Attached is a patch which implements some flow control for the
>>>> NFS client to control dirty pages. The flow control is
>>>> implemented on a per-file basis and causes dirty pages to be
>>>> written out when the client can detect that the application is
>>>> writing in a serial fashion and has dirtied enough pages to
>>>> fill a complete over the wire transfer.
>>>>
>>>> This work was precipitated by working on a situation where a
>>>> server at a customer site was not able to adequately handle
>>>> the behavior of the Linux NFS client. This particular server
>>>> required that all data to the file written to the file be
>>>> written in a strictly serial fashion. It also had problems
>>>> handling the Linux NFS client semantic of caching a large
>>>> amount of data and then sending out that data all at once.
>>>>
>>>> The sequential ordering problem was resolved by a previous
>>>> patch which was submitted to the linux-nfs list. This patch
>>>> addresses the capacity problem.
>>>>
>>>> The problem is resolved by sending WRITE requests much
>>>> earlier in the process of the application writing to the file.
>>>> The client keeps track of the number of dirty pages associated
>>>> with the file and also the last offset of the data being
>>>> written. When the client detects that a full over the wire
>>>> transfer could be constructed and that the application is
>>>> writing sequentially, then it generates an UNSTABLE write to
>>>> server for the currently dirty data.
>>>>
>>>> The client also keeps track of the number of these WRITE
>>>> requests which have been generated. It flow controls based
>>>> on a configurable maximum. This keeps the client from
>>>> completely overwhelming the server.
>>>>
>>>> A nice side effect of the framework is that the issue of
>>>> stat()'ing a file being written can be handled much more
>>>> quickly than before. The amount of data that must be
>>>> transmitted to the server to satisfy the "latest mtime"
>>>> requirement is limited. Also, the application writing to
>>>> the file is blocked until the over the wire GETATTR is
>>>> completed. This allows the GETATTR to be send and the
>>>> response received without competing with the data being
>>>> written.
>>>>
>>>> I did not do formal performance testing, but in my
>>>> watching testing, I did not see any performance regressions.
>>>>
>>>> As a side note -- the more natural model of flow control
>>>> would seem to be at the client/server level instead of
>>>> the per-file level. However, that level was too coarse
>>>> with the particular server that was required to be used
>>>> because its requirements were at the per-file level.
>>>>
>>>> Thanx...
>>>>
>>>> ps
>>>>
>>>> Signed-off-by: Peter Staubach <[email protected]>
>>>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>>>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>>>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>>>> kstat *stat)
>>>> {
>>>> struct inode *inode = dentry->d_inode;
>>>> - int need_atime = NFS_I(inode)->cache_validity &
>>>> NFS_INO_INVALID_ATIME;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
>>>> int err;
>>>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>>>
>>>> /*
>>>> * Flush out writes to the server in order to update c/mtime.
>>>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>> * nfs_wb_nocommit.
>>>> */
>>>> if (S_ISREG(inode->i_mode)) {
>>>> - mutex_lock(&inode->i_mutex);
>>>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>>>> nfs_wb_nocommit(inode);
>>>> - mutex_unlock(&inode->i_mutex);
>>>> }
>>>>
>>>> /*
>>>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>> generic_fillattr(inode, stat);
>>>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>>>> }
>>>> + if (S_ISREG(inode->i_mode)) {
>>>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>>>> + wake_up(&nfsi->writes_wq);
>>>> + }
>>>> return err;
>>>> }
>>>>
>>>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>>>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>>>> nfsi->ncommit = 0;
>>>> nfsi->npages = 0;
>>>> + atomic_set(&nfsi->ndirty, 0);
>>>> atomic_set(&nfsi->silly_count, 1);
>>>> INIT_HLIST_HEAD(&nfsi->silly_list);
>>>> init_waitqueue_head(&nfsi->waitqueue);
>>>> + atomic_set(&nfsi->writes, 0);
>>>> + init_waitqueue_head(&nfsi->writes_wq);
>>>> + nfsi->wrpos = 0;
>>>> nfs4_init_once(nfsi);
>>>> }
>>>>
>>>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/write.c
>>>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>>>> if (!ret) {
>>>> struct inode *inode = page->mapping->host;
>>>> struct nfs_server *nfss = NFS_SERVER(inode);
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>
>>>> + atomic_dec(&nfsi->ndirty);
>>>> if (atomic_long_inc_return(&nfss->writeback) >
>>>> NFS_CONGESTION_ON_THRESH)
>>>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>>>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>>>> return ret;
>>>> }
>>>>
>>>> +int nfs_max_outstanding_writes = 16;
>>>> +
>>>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + atomic_inc(&nfsi->writes);
>>>> +}
>>>> +
>>>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + if (atomic_dec_return(&nfsi->writes) <
>>>> nfs_max_outstanding_writes)
>>>> + wake_up(&nfsi->writes_wq);
>>>> +}
>>>> +
>>>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>>>> +{
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + if (nfs_max_outstanding_writes) {
>>>> + wait_event(nfsi->writes_wq,
>>>> + atomic_read(&nfsi->writes) < nfs_max_outstanding_writes);
>>>> + }
>>>> +}
>>>> +
>>>> int nfs_writepages(struct address_space *mapping, struct
>>>> writeback_control *wbc)
>>>> {
>>>> struct inode *inode = mapping->host;
>>>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>>>> SetPagePrivate(req->wb_page);
>>>> set_page_private(req->wb_page, (unsigned long)req);
>>>> nfsi->npages++;
>>>> + atomic_inc(&nfsi->ndirty);
>>>> kref_get(&req->wb_kref);
>>>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>>>> NFS_PAGE_TAG_LOCKED);
>>>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>>>> static void
>>>> nfs_mark_request_dirty(struct nfs_page *req)
>>>> {
>>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> +
>>>> + atomic_inc(&nfsi->ndirty);
>>>> __set_page_dirty_nobuffers(req->wb_page);
>>>> }
>>>>
>>>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>>>> req->wb_bytes = end - req->wb_offset;
>>>> else
>>>> req->wb_bytes = rqend - req->wb_offset;
>>>> + atomic_inc(&NFS_I(inode)->ndirty);
>>>> out_unlock:
>>>> spin_unlock(&inode->i_lock);
>>>> return req;
>>>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>>>> count,
>>>> (unsigned long long)data->args.offset);
>>>>
>>>> + nfs_inc_outstanding_writes(inode);
>>>> +
>>>> task = rpc_run_task(&task_setup_data);
>>>> if (IS_ERR(task))
>>>> return PTR_ERR(task);
>>>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>>>> */
>>>> status = NFS_PROTO(data->inode)->write_done(task, data);
>>>> if (status != 0)
>>>> - return status;
>>>> + goto out;
>>>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
>>>>
>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>>>> /* Can't do anything about it except throw an error. */
>>>> task->tk_status = -EIO;
>>>> }
>>>> - return 0;
>>>> +out:
>>>> + nfs_dec_outstanding_writes(data->inode);
>>>> + return status;
>>>> }
>>>>
>>>>
>>>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>>>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>>>> }
>>>>
>>>> +/*
>>>> + * Start the WRITE requests for dirty pages on their way.
>>>> + * This is used when a sufficient number of dirty pages
>>>> + * have accumulated.
>>>> + */
>>>> +int nfs_wb_interim(struct inode *inode)
>>>> +{
>>>> + struct address_space *mapping = inode->i_mapping;
>>>> + struct writeback_control wbc = {
>>>> + .bdi = mapping->backing_dev_info,
>>>> + .sync_mode = WB_SYNC_NONE,
>>>> + .nr_to_write = LONG_MAX,
>>>> + .range_start = 0,
>>>> + .range_end = LLONG_MAX,
>>>> + };
>>>> + int ret;
>>>> +
>>>> + ret = nfs_writepages(mapping, &wbc);
>>>> + if (ret < 0)
>>>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>>>> + return ret;
>>>> +}
>>>> +
>>>> int __init nfs_init_writepagecache(void)
>>>> {
>>>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>>>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>>>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>>>> .mode = 0644,
>>>> .proc_handler = &proc_dointvec,
>>>> },
>>>> + {
>>>> + .ctl_name = CTL_UNNUMBERED,
>>>> + .procname = "nfs_max_outstanding_writes",
>>>> + .data = &nfs_max_outstanding_writes,
>>>> + .maxlen = sizeof(int),
>>>> + .mode = 0644,
>>>> + .proc_handler = &proc_dointvec,
>>>> + },
>>>> { .ctl_name = 0 }
>>>> };
>>>>
>>>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>>>> +++ linux-2.6.28.i686/fs/nfs/file.c
>>>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>>>> return 0;
>>>> }
>>>>
>>>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>>>> +{
>>>> + return NFS_I(inode)->wrpos == pos;
>>>> +}
>>>> +
>>>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct
>>>> iovec *iov,
>>>> unsigned long nr_segs, loff_t pos)
>>>> {
>>>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>>>> - struct inode * inode = dentry->d_inode;
>>>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>>>> + struct inode *inode = dentry->d_inode;
>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>> ssize_t result;
>>>> size_t count = iov_length(iov, nr_segs);
>>>>
>>>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>>>> result = -EBUSY;
>>>> if (IS_SWAPFILE(inode))
>>>> goto out_swapfile;
>>>> +
>>>> + result = count;
>>>> + if (!count)
>>>> + goto out;
>>>> +
>>>> + nfs_wait_for_outstanding_writes(inode);
>>>> +
>>>> /*
>>>> * O_APPEND implies that we must revalidate the file length.
>>>> */
>>>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>>>> goto out;
>>>> }
>>>>
>>>> - result = count;
>>>> - if (!count)
>>>> - goto out;
>>>> -
>>>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>>>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>>>> /* Return error values for O_SYNC and IS_SYNC() */
>>>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>> - int err =
>>>> nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
>>>> - if (err < 0)
>>>> - result = err;
>>>> + if (result >= 0) {
>>>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>> + int err;
>>>> + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
>>>> + inode);
>>>> + if (err < 0)
>>>> + result = err;
>>>> + } else if (nfs_max_outstanding_writes &&
>>>> + nfs_is_serial(inode, pos) &&
>>>> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
>>>> + nfs_wb_interim(inode);
>>>> + if (result > 0)
>>>> + nfsi->wrpos = pos + result;
>>>> }
>>>> out:
>>>> return result;
>>>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>>>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>>>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>>>
>>>> unsigned long ncommit,
>>>> npages;
>>>> + atomic_t ndirty;
>>>>
>>>> /* Open contexts for shared mmap writes */
>>>> struct list_head open_files;
>>>> @@ -186,6 +187,9 @@ struct nfs_inode {
>>>> fmode_t delegation_state;
>>>> struct rw_semaphore rwsem;
>>>> #endif /* CONFIG_NFS_V4*/
>>>> + atomic_t writes; /* number of outstanding WRITEs */
>>>> + wait_queue_head_t writes_wq;
>>>> + loff_t wrpos; /* position after last WRITE */
>>>> struct inode vfs_inode;
>>>> };
>>>>
>>>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>>>> * linux/fs/nfs/write.c
>>>> */
>>>> extern int nfs_congestion_kb;
>>>> +extern int nfs_max_outstanding_writes;
>>>> extern int nfs_writepage(struct page *page, struct
>>>> writeback_control *wbc);
>>>> extern int nfs_writepages(struct address_space *, struct
>>>> writeback_control *);
>>>> extern int nfs_flush_incompatible(struct file *file, struct page
>>>> *page);
>>>> extern int nfs_updatepage(struct file *, struct page *, unsigned
>>>> int, unsigned int);
>>>> -extern int nfs_writeback_done(struct rpc_task *, struct
>>>> nfs_write_data *);
>>>> +extern int nfs_writeback_done(struct rpc_task *, struct
>>>> nfs_write_data *);
>>>> extern void nfs_writedata_release(void *);
>>>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>>>
>>>> /*
>>>> * Try to write back everything synchronously (but check the
>>>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>>>> extern int nfs_wb_nocommit(struct inode *inode);
>>>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>>>> extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
>>>> +extern int nfs_wb_interim(struct inode *);
>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>> extern int nfs_commit_inode(struct inode *, int);
>>>> extern struct nfs_write_data *nfs_commitdata_alloc(void);
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com


2009-01-23 21:39:07

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

On Jan 23, 2009, at Jan 23, 2009, 4:29 PM, Peter Staubach wrote:
> Chuck Lever wrote:
>> On Jan 23, 2009, at Jan 23, 2009, 4:00 PM, Peter Staubach wrote:
>>> Chuck Lever wrote:
>>>> How does this play with memory pressure?
>>>>
>>>> I would expect the generic background page flushers, like pdflush
>>>> and kswapd, could still cause out-of-order writes.
>>>>
>>>
>>> I believe that they all end up calling into the NFS client
>>> code via the nfs_writepages() interface with the writeback
>>> control struct set to write out pages sequentially, possibly
>>> limiting the number of pages to be written, but usually not.
>>>
>>> I ran this code on a 1GB system, writing out 2GB+ files, and
>>> the only out of order issues that I saw were generated by
>>> the pdflush and the application doing the writing itself
>>> intermixing the generation of WRITE requests. This was
>>> addressed by the introduction of NFS_INO_FLUSHING and the
>>> code in nfs_writepages() which uses it.
>>>
>>> I also ran the code on a 4GB system with the changes,
>>> writing the same sized files, andno out of order WRITE
>>> requests were seen.
>>>
>>> I believe that Nick Piggin solved a bunch of these issues
>>> recently and the NFS_INO_FLUSHING patch along with his
>>> patches seems to have resolved the issues.
>>
>> That's good news.
>>
>> How about ^C -- any weirdness there? pages left pinned on the
>> client, fractured writes on the server, application hangs?
>>
>> Have you tried this patch with a multi-threaded write-intensive
>> async I/O workload? I'm sure you know which one I'm referring
>> to. ;-)
>>
>> Would it make sense to key the new nfs_max_outstanding_writes
>> sysctl to the RPC slot table size for that mount point? Another
>> tunable... that doesn't self-tune... Ugh. How did you arrive at
>> the default value of 16?
>>
>
> Sorry, forgot to answer your last question here. Mostly,
> I just picked it out of the air? Other systems default to
> 8, but that seemed a little small, so I figured that 16, 32K
> WRITE requests outstanding should be a reasonable amount of
> outstanding data.
>
> Not scientific at all. I am open to suggestions for
> autotuning, etc.

I'm worried about that piece of it. It sounds like you don't have a
single value that works reasonably well in some common cases. Do you
have any theories about why 2 was better on the big memory client?

Also, do you think this value is dependent on wsize in any way? My
systems all use wsize=1M, as that is what Linux and Solaris (all the
server types I have here) support.

Naturally it doesn't have to be set to a single constant; we could
have a heuristic that sets a decent default value instead.

But 16 is the same size as the RPC slot table. I wonder if some
workloads would benefit or would be hurt by allowing the slot table to
be filled completely with writes. Some testing with larger (or
smaller) slot tables might be illuminating. I wonder if a per-mount
or even a per-file setting would be more useful than a single setting
for the whole client. But I don't have a feel for what makes one
value better than another.

> Thanx...
>
> ps
>
>> CITI had been trying to improve write performance on transatlantic
>> 10Gb links. Would be useful to test in that environment.
>>
>>> Thanx...
>>>
>>> ps
>>>
>>>> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>>>>
>>>>> Hi.
>>>>>
>>>>> Attached is a patch which implements some flow control for the
>>>>> NFS client to control dirty pages. The flow control is
>>>>> implemented on a per-file basis and causes dirty pages to be
>>>>> written out when the client can detect that the application is
>>>>> writing in a serial fashion and has dirtied enough pages to
>>>>> fill a complete over the wire transfer.
>>>>>
>>>>> This work was precipitated by working on a situation where a
>>>>> server at a customer site was not able to adequately handle
>>>>> the behavior of the Linux NFS client. This particular server
>>>>> required that all data to the file written to the file be
>>>>> written in a strictly serial fashion. It also had problems
>>>>> handling the Linux NFS client semantic of caching a large
>>>>> amount of data and then sending out that data all at once.
>>>>>
>>>>> The sequential ordering problem was resolved by a previous
>>>>> patch which was submitted to the linux-nfs list. This patch
>>>>> addresses the capacity problem.
>>>>>
>>>>> The problem is resolved by sending WRITE requests much
>>>>> earlier in the process of the application writing to the file.
>>>>> The client keeps track of the number of dirty pages associated
>>>>> with the file and also the last offset of the data being
>>>>> written. When the client detects that a full over the wire
>>>>> transfer could be constructed and that the application is
>>>>> writing sequentially, then it generates an UNSTABLE write to
>>>>> server for the currently dirty data.
>>>>>
>>>>> The client also keeps track of the number of these WRITE
>>>>> requests which have been generated. It flow controls based
>>>>> on a configurable maximum. This keeps the client from
>>>>> completely overwhelming the server.
>>>>>
>>>>> A nice side effect of the framework is that the issue of
>>>>> stat()'ing a file being written can be handled much more
>>>>> quickly than before. The amount of data that must be
>>>>> transmitted to the server to satisfy the "latest mtime"
>>>>> requirement is limited. Also, the application writing to
>>>>> the file is blocked until the over the wire GETATTR is
>>>>> completed. This allows the GETATTR to be send and the
>>>>> response received without competing with the data being
>>>>> written.
>>>>>
>>>>> I did not do formal performance testing, but in my
>>>>> watching testing, I did not see any performance regressions.
>>>>>
>>>>> As a side note -- the more natural model of flow control
>>>>> would seem to be at the client/server level instead of
>>>>> the per-file level. However, that level was too coarse
>>>>> with the particular server that was required to be used
>>>>> because its requirements were at the per-file level.
>>>>>
>>>>> Thanx...
>>>>>
>>>>> ps
>>>>>
>>>>> Signed-off-by: Peter Staubach <[email protected]>
>>>>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>>>>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>>>>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>>>>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
>>>>> struct kstat *stat)
>>>>> {
>>>>> struct inode *inode = dentry->d_inode;
>>>>> - int need_atime = NFS_I(inode)->cache_validity &
>>>>> NFS_INO_INVALID_ATIME;
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> + int need_atime = nfsi->cache_validity &
>>>>> NFS_INO_INVALID_ATIME;
>>>>> int err;
>>>>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>>>>
>>>>> /*
>>>>> * Flush out writes to the server in order to update c/mtime.
>>>>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>>> * nfs_wb_nocommit.
>>>>> */
>>>>> if (S_ISREG(inode->i_mode)) {
>>>>> - mutex_lock(&inode->i_mutex);
>>>>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>>>>> nfs_wb_nocommit(inode);
>>>>> - mutex_unlock(&inode->i_mutex);
>>>>> }
>>>>>
>>>>> /*
>>>>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>>> generic_fillattr(inode, stat);
>>>>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>>>>> }
>>>>> + if (S_ISREG(inode->i_mode)) {
>>>>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>>>>> + wake_up(&nfsi->writes_wq);
>>>>> + }
>>>>> return err;
>>>>> }
>>>>>
>>>>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>>>>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>>>>> nfsi->ncommit = 0;
>>>>> nfsi->npages = 0;
>>>>> + atomic_set(&nfsi->ndirty, 0);
>>>>> atomic_set(&nfsi->silly_count, 1);
>>>>> INIT_HLIST_HEAD(&nfsi->silly_list);
>>>>> init_waitqueue_head(&nfsi->waitqueue);
>>>>> + atomic_set(&nfsi->writes, 0);
>>>>> + init_waitqueue_head(&nfsi->writes_wq);
>>>>> + nfsi->wrpos = 0;
>>>>> nfs4_init_once(nfsi);
>>>>> }
>>>>>
>>>>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>>>>> +++ linux-2.6.28.i686/fs/nfs/write.c
>>>>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>>>>> if (!ret) {
>>>>> struct inode *inode = page->mapping->host;
>>>>> struct nfs_server *nfss = NFS_SERVER(inode);
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>
>>>>> + atomic_dec(&nfsi->ndirty);
>>>>> if (atomic_long_inc_return(&nfss->writeback) >
>>>>> NFS_CONGESTION_ON_THRESH)
>>>>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>>>>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>>>>> return ret;
>>>>> }
>>>>>
>>>>> +int nfs_max_outstanding_writes = 16;
>>>>> +
>>>>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>>>>> +{
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> +
>>>>> + atomic_inc(&nfsi->writes);
>>>>> +}
>>>>> +
>>>>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>>>>> +{
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> +
>>>>> + if (atomic_dec_return(&nfsi->writes) <
>>>>> nfs_max_outstanding_writes)
>>>>> + wake_up(&nfsi->writes_wq);
>>>>> +}
>>>>> +
>>>>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>>>>> +{
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> +
>>>>> + if (nfs_max_outstanding_writes) {
>>>>> + wait_event(nfsi->writes_wq,
>>>>> + atomic_read(&nfsi->writes) <
>>>>> nfs_max_outstanding_writes);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> int nfs_writepages(struct address_space *mapping, struct
>>>>> writeback_control *wbc)
>>>>> {
>>>>> struct inode *inode = mapping->host;
>>>>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>>>>> SetPagePrivate(req->wb_page);
>>>>> set_page_private(req->wb_page, (unsigned long)req);
>>>>> nfsi->npages++;
>>>>> + atomic_inc(&nfsi->ndirty);
>>>>> kref_get(&req->wb_kref);
>>>>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>>>>> NFS_PAGE_TAG_LOCKED);
>>>>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>>>>> static void
>>>>> nfs_mark_request_dirty(struct nfs_page *req)
>>>>> {
>>>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> +
>>>>> + atomic_inc(&nfsi->ndirty);
>>>>> __set_page_dirty_nobuffers(req->wb_page);
>>>>> }
>>>>>
>>>>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>>>>> req->wb_bytes = end - req->wb_offset;
>>>>> else
>>>>> req->wb_bytes = rqend - req->wb_offset;
>>>>> + atomic_inc(&NFS_I(inode)->ndirty);
>>>>> out_unlock:
>>>>> spin_unlock(&inode->i_lock);
>>>>> return req;
>>>>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>>>>> count,
>>>>> (unsigned long long)data->args.offset);
>>>>>
>>>>> + nfs_inc_outstanding_writes(inode);
>>>>> +
>>>>> task = rpc_run_task(&task_setup_data);
>>>>> if (IS_ERR(task))
>>>>> return PTR_ERR(task);
>>>>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>>>>> */
>>>>> status = NFS_PROTO(data->inode)->write_done(task, data);
>>>>> if (status != 0)
>>>>> - return status;
>>>>> + goto out;
>>>>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp-
>>>>> >count);
>>>>>
>>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>>>>> /* Can't do anything about it except throw an error. */
>>>>> task->tk_status = -EIO;
>>>>> }
>>>>> - return 0;
>>>>> +out:
>>>>> + nfs_dec_outstanding_writes(data->inode);
>>>>> + return status;
>>>>> }
>>>>>
>>>>>
>>>>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>>>>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>>>>> }
>>>>>
>>>>> +/*
>>>>> + * Start the WRITE requests for dirty pages on their way.
>>>>> + * This is used when a sufficient number of dirty pages
>>>>> + * have accumulated.
>>>>> + */
>>>>> +int nfs_wb_interim(struct inode *inode)
>>>>> +{
>>>>> + struct address_space *mapping = inode->i_mapping;
>>>>> + struct writeback_control wbc = {
>>>>> + .bdi = mapping->backing_dev_info,
>>>>> + .sync_mode = WB_SYNC_NONE,
>>>>> + .nr_to_write = LONG_MAX,
>>>>> + .range_start = 0,
>>>>> + .range_end = LLONG_MAX,
>>>>> + };
>>>>> + int ret;
>>>>> +
>>>>> + ret = nfs_writepages(mapping, &wbc);
>>>>> + if (ret < 0)
>>>>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> int __init nfs_init_writepagecache(void)
>>>>> {
>>>>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>>>>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>>>>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>>>>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>>>>> .mode = 0644,
>>>>> .proc_handler = &proc_dointvec,
>>>>> },
>>>>> + {
>>>>> + .ctl_name = CTL_UNNUMBERED,
>>>>> + .procname = "nfs_max_outstanding_writes",
>>>>> + .data = &nfs_max_outstanding_writes,
>>>>> + .maxlen = sizeof(int),
>>>>> + .mode = 0644,
>>>>> + .proc_handler = &proc_dointvec,
>>>>> + },
>>>>> { .ctl_name = 0 }
>>>>> };
>>>>>
>>>>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>>>>> +++ linux-2.6.28.i686/fs/nfs/file.c
>>>>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>>>>> return 0;
>>>>> }
>>>>>
>>>>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>>>>> +{
>>>>> + return NFS_I(inode)->wrpos == pos;
>>>>> +}
>>>>> +
>>>>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct
>>>>> iovec *iov,
>>>>> unsigned long nr_segs, loff_t pos)
>>>>> {
>>>>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>>>>> - struct inode * inode = dentry->d_inode;
>>>>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>>>>> + struct inode *inode = dentry->d_inode;
>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>> ssize_t result;
>>>>> size_t count = iov_length(iov, nr_segs);
>>>>>
>>>>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>>>>> result = -EBUSY;
>>>>> if (IS_SWAPFILE(inode))
>>>>> goto out_swapfile;
>>>>> +
>>>>> + result = count;
>>>>> + if (!count)
>>>>> + goto out;
>>>>> +
>>>>> + nfs_wait_for_outstanding_writes(inode);
>>>>> +
>>>>> /*
>>>>> * O_APPEND implies that we must revalidate the file length.
>>>>> */
>>>>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>>>>> goto out;
>>>>> }
>>>>>
>>>>> - result = count;
>>>>> - if (!count)
>>>>> - goto out;
>>>>> -
>>>>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>>>>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>>>>> /* Return error values for O_SYNC and IS_SYNC() */
>>>>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp,
>>>>> inode)) {
>>>>> - int err = nfs_do_fsync(nfs_file_open_context(iocb-
>>>>> >ki_filp), inode);
>>>>> - if (err < 0)
>>>>> - result = err;
>>>>> + if (result >= 0) {
>>>>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>>> + int err;
>>>>> + err = nfs_do_fsync(nfs_file_open_context(iocb-
>>>>> >ki_filp),
>>>>> + inode);
>>>>> + if (err < 0)
>>>>> + result = err;
>>>>> + } else if (nfs_max_outstanding_writes &&
>>>>> + nfs_is_serial(inode, pos) &&
>>>>> + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)-
>>>>> >wpages)
>>>>> + nfs_wb_interim(inode);
>>>>> + if (result > 0)
>>>>> + nfsi->wrpos = pos + result;
>>>>> }
>>>>> out:
>>>>> return result;
>>>>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>>>>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>>>>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>>>>
>>>>> unsigned long ncommit,
>>>>> npages;
>>>>> + atomic_t ndirty;
>>>>>
>>>>> /* Open contexts for shared mmap writes */
>>>>> struct list_head open_files;
>>>>> @@ -186,6 +187,9 @@ struct nfs_inode {
>>>>> fmode_t delegation_state;
>>>>> struct rw_semaphore rwsem;
>>>>> #endif /* CONFIG_NFS_V4*/
>>>>> + atomic_t writes; /* number of outstanding WRITEs */
>>>>> + wait_queue_head_t writes_wq;
>>>>> + loff_t wrpos; /* position after last WRITE */
>>>>> struct inode vfs_inode;
>>>>> };
>>>>>
>>>>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>>>>> * linux/fs/nfs/write.c
>>>>> */
>>>>> extern int nfs_congestion_kb;
>>>>> +extern int nfs_max_outstanding_writes;
>>>>> extern int nfs_writepage(struct page *page, struct
>>>>> writeback_control *wbc);
>>>>> extern int nfs_writepages(struct address_space *, struct
>>>>> writeback_control *);
>>>>> extern int nfs_flush_incompatible(struct file *file, struct
>>>>> page *page);
>>>>> extern int nfs_updatepage(struct file *, struct page *,
>>>>> unsigned int, unsigned int);
>>>>> -extern int nfs_writeback_done(struct rpc_task *, struct
>>>>> nfs_write_data *);
>>>>> +extern int nfs_writeback_done(struct rpc_task *, struct
>>>>> nfs_write_data *);
>>>>> extern void nfs_writedata_release(void *);
>>>>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>>>>
>>>>> /*
>>>>> * Try to write back everything synchronously (but check the
>>>>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>>>>> extern int nfs_wb_nocommit(struct inode *inode);
>>>>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>>>>> extern int nfs_wb_page_cancel(struct inode *inode, struct page*
>>>>> page);
>>>>> +extern int nfs_wb_interim(struct inode *);
>>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>>> extern int nfs_commit_inode(struct inode *, int);
>>>>> extern struct nfs_write_data *nfs_commitdata_alloc(void);
>>
>> --
>> Chuck Lever
>> chuck[dot]lever[at]oracle[dot]com
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs"
> in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2009-01-23 21:49:54

by Peter Staubach

[permalink] [raw]
Subject: Re: [PATCH] flow control for WRITE requests

Chuck Lever wrote:
> On Jan 23, 2009, at Jan 23, 2009, 4:29 PM, Peter Staubach wrote:
>> Chuck Lever wrote:
>>> On Jan 23, 2009, at Jan 23, 2009, 4:00 PM, Peter Staubach wrote:
>>>> Chuck Lever wrote:
>>>>> How does this play with memory pressure?
>>>>>
>>>>> I would expect the generic background page flushers, like pdflush
>>>>> and kswapd, could still cause out-of-order writes.
>>>>>
>>>>
>>>> I believe that they all end up calling into the NFS client
>>>> code via the nfs_writepages() interface with the writeback
>>>> control struct set to write out pages sequentially, possibly
>>>> limiting the number of pages to be written, but usually not.
>>>>
>>>> I ran this code on a 1GB system, writing out 2GB+ files, and
>>>> the only out of order issues that I saw were generated by
>>>> the pdflush and the application doing the writing itself
>>>> intermixing the generation of WRITE requests. This was
>>>> addressed by the introduction of NFS_INO_FLUSHING and the
>>>> code in nfs_writepages() which uses it.
>>>>
>>>> I also ran the code on a 4GB system with the changes,
>>>> writing the same sized files, andno out of order WRITE
>>>> requests were seen.
>>>>
>>>> I believe that Nick Piggin solved a bunch of these issues
>>>> recently and the NFS_INO_FLUSHING patch along with his
>>>> patches seems to have resolved the issues.
>>>
>>> That's good news.
>>>
>>> How about ^C -- any weirdness there? pages left pinned on the
>>> client, fractured writes on the server, application hangs?
>>>
>>> Have you tried this patch with a multi-threaded write-intensive
>>> async I/O workload? I'm sure you know which one I'm referring to. ;-)
>>>
>>> Would it make sense to key the new nfs_max_outstanding_writes sysctl
>>> to the RPC slot table size for that mount point? Another tunable...
>>> that doesn't self-tune... Ugh. How did you arrive at the default
>>> value of 16?
>>>
>>
>> Sorry, forgot to answer your last question here. Mostly,
>> I just picked it out of the air? Other systems default to
>> 8, but that seemed a little small, so I figured that 16, 32K
>> WRITE requests outstanding should be a reasonable amount of
>> outstanding data.
>>
>> Not scientific at all. I am open to suggestions for
>> autotuning, etc.
>
> I'm worried about that piece of it. It sounds like you don't have a
> single value that works reasonably well in some common cases. Do you
> have any theories about why 2 was better on the big memory client?
>

Well, 16 seems to work well on all of my systems here which
are running the patch and the RHEL systems that are running
their versions of the patch.

The limitation had more to do with the server. I couldn't
see the server or spend any time on it, but the descriptions
indicated that it was the bottleneck. It had per-file
WRITE caches that weren't very big and it also had a fair
amount of CPU intensive processing that it had to do to
each WRITE request.

> Also, do you think this value is dependent on wsize in any way? My
> systems all use wsize=1M, as that is what Linux and Solaris (all the
> server types I have here) support.
>

Maybe, but not really. This limit works in addition to the
normal system BDI things, so if the system can't take 16 times
the wsize cached before writing, then pdflush or some such
daemon will kick in and push out the pages first.

The limit is not enforced at the spot where WRITE RPC requests
are generated. It is more to slow down the application once
there are sufficient WRITE requests outstanding to better try
to balance the dirtying of pages with the cleaning of pages.


> Naturally it doesn't have to be set to a single constant; we could
> have a heuristic that sets a decent default value instead.
>
> But 16 is the same size as the RPC slot table. I wonder if some
> workloads would benefit or would be hurt by allowing the slot table to
> be filled completely with writes. Some testing with larger (or
> smaller) slot tables might be illuminating. I wonder if a per-mount
> or even a per-file setting would be more useful than a single setting
> for the whole client. But I don't have a feel for what makes one
> value better than another.
>

This could be, but I hesitate to make this too obviously
configurable. It is my hope (and belief) that most people
will not have to ever mess with this thing.

Something to think about would be to use 2 TCP connections,
one for bulk data movement and the other for small things,
so that we don't get so-called head of line blocking.

ps

>> Thanx...
>>
>> ps
>>
>>> CITI had been trying to improve write performance on transatlantic
>>> 10Gb links. Would be useful to test in that environment.
>>>
>>>> Thanx...
>>>>
>>>> ps
>>>>
>>>>> On Jan 23, 2009, at Jan 23, 2009, 2:10 PM, Peter Staubach wrote:
>>>>>
>>>>>> Hi.
>>>>>>
>>>>>> Attached is a patch which implements some flow control for the
>>>>>> NFS client to control dirty pages. The flow control is
>>>>>> implemented on a per-file basis and causes dirty pages to be
>>>>>> written out when the client can detect that the application is
>>>>>> writing in a serial fashion and has dirtied enough pages to
>>>>>> fill a complete over the wire transfer.
>>>>>>
>>>>>> This work was precipitated by working on a situation where a
>>>>>> server at a customer site was not able to adequately handle
>>>>>> the behavior of the Linux NFS client. This particular server
>>>>>> required that all data to the file written to the file be
>>>>>> written in a strictly serial fashion. It also had problems
>>>>>> handling the Linux NFS client semantic of caching a large
>>>>>> amount of data and then sending out that data all at once.
>>>>>>
>>>>>> The sequential ordering problem was resolved by a previous
>>>>>> patch which was submitted to the linux-nfs list. This patch
>>>>>> addresses the capacity problem.
>>>>>>
>>>>>> The problem is resolved by sending WRITE requests much
>>>>>> earlier in the process of the application writing to the file.
>>>>>> The client keeps track of the number of dirty pages associated
>>>>>> with the file and also the last offset of the data being
>>>>>> written. When the client detects that a full over the wire
>>>>>> transfer could be constructed and that the application is
>>>>>> writing sequentially, then it generates an UNSTABLE write to
>>>>>> server for the currently dirty data.
>>>>>>
>>>>>> The client also keeps track of the number of these WRITE
>>>>>> requests which have been generated. It flow controls based
>>>>>> on a configurable maximum. This keeps the client from
>>>>>> completely overwhelming the server.
>>>>>>
>>>>>> A nice side effect of the framework is that the issue of
>>>>>> stat()'ing a file being written can be handled much more
>>>>>> quickly than before. The amount of data that must be
>>>>>> transmitted to the server to satisfy the "latest mtime"
>>>>>> requirement is limited. Also, the application writing to
>>>>>> the file is blocked until the over the wire GETATTR is
>>>>>> completed. This allows the GETATTR to be send and the
>>>>>> response received without competing with the data being
>>>>>> written.
>>>>>>
>>>>>> I did not do formal performance testing, but in my
>>>>>> watching testing, I did not see any performance regressions.
>>>>>>
>>>>>> As a side note -- the more natural model of flow control
>>>>>> would seem to be at the client/server level instead of
>>>>>> the per-file level. However, that level was too coarse
>>>>>> with the particular server that was required to be used
>>>>>> because its requirements were at the per-file level.
>>>>>>
>>>>>> Thanx...
>>>>>>
>>>>>> ps
>>>>>>
>>>>>> Signed-off-by: Peter Staubach <[email protected]>
>>>>>> --- linux-2.6.28.i686/fs/nfs/inode.c.org
>>>>>> +++ linux-2.6.28.i686/fs/nfs/inode.c
>>>>>> @@ -486,8 +486,10 @@ void nfs_setattr_update_inode(struct ino
>>>>>> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
>>>>>> struct kstat *stat)
>>>>>> {
>>>>>> struct inode *inode = dentry->d_inode;
>>>>>> - int need_atime = NFS_I(inode)->cache_validity &
>>>>>> NFS_INO_INVALID_ATIME;
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> + int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
>>>>>> int err;
>>>>>> + int inc_outstanding_writes = nfs_max_outstanding_writes;
>>>>>>
>>>>>> /*
>>>>>> * Flush out writes to the server in order to update c/mtime.
>>>>>> @@ -497,9 +499,8 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>>>> * nfs_wb_nocommit.
>>>>>> */
>>>>>> if (S_ISREG(inode->i_mode)) {
>>>>>> - mutex_lock(&inode->i_mutex);
>>>>>> + atomic_add(inc_outstanding_writes, &nfsi->writes);
>>>>>> nfs_wb_nocommit(inode);
>>>>>> - mutex_unlock(&inode->i_mutex);
>>>>>> }
>>>>>>
>>>>>> /*
>>>>>> @@ -523,6 +524,10 @@ int nfs_getattr(struct vfsmount *mnt, st
>>>>>> generic_fillattr(inode, stat);
>>>>>> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
>>>>>> }
>>>>>> + if (S_ISREG(inode->i_mode)) {
>>>>>> + atomic_sub(inc_outstanding_writes, &nfsi->writes);
>>>>>> + wake_up(&nfsi->writes_wq);
>>>>>> + }
>>>>>> return err;
>>>>>> }
>>>>>>
>>>>>> @@ -1288,9 +1293,13 @@ static void init_once(void *foo)
>>>>>> INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>>>>>> nfsi->ncommit = 0;
>>>>>> nfsi->npages = 0;
>>>>>> + atomic_set(&nfsi->ndirty, 0);
>>>>>> atomic_set(&nfsi->silly_count, 1);
>>>>>> INIT_HLIST_HEAD(&nfsi->silly_list);
>>>>>> init_waitqueue_head(&nfsi->waitqueue);
>>>>>> + atomic_set(&nfsi->writes, 0);
>>>>>> + init_waitqueue_head(&nfsi->writes_wq);
>>>>>> + nfsi->wrpos = 0;
>>>>>> nfs4_init_once(nfsi);
>>>>>> }
>>>>>>
>>>>>> --- linux-2.6.28.i686/fs/nfs/write.c.org
>>>>>> +++ linux-2.6.28.i686/fs/nfs/write.c
>>>>>> @@ -197,7 +197,9 @@ static int nfs_set_page_writeback(struct
>>>>>> if (!ret) {
>>>>>> struct inode *inode = page->mapping->host;
>>>>>> struct nfs_server *nfss = NFS_SERVER(inode);
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>>
>>>>>> + atomic_dec(&nfsi->ndirty);
>>>>>> if (atomic_long_inc_return(&nfss->writeback) >
>>>>>> NFS_CONGESTION_ON_THRESH)
>>>>>> set_bdi_congested(&nfss->backing_dev_info, WRITE);
>>>>>> @@ -310,6 +312,33 @@ static int nfs_writepages_callback(struc
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> +int nfs_max_outstanding_writes = 16;
>>>>>> +
>>>>>> +static void nfs_inc_outstanding_writes(struct inode *inode)
>>>>>> +{
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> +
>>>>>> + atomic_inc(&nfsi->writes);
>>>>>> +}
>>>>>> +
>>>>>> +static void nfs_dec_outstanding_writes(struct inode *inode)
>>>>>> +{
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> +
>>>>>> + if (atomic_dec_return(&nfsi->writes) <
>>>>>> nfs_max_outstanding_writes)
>>>>>> + wake_up(&nfsi->writes_wq);
>>>>>> +}
>>>>>> +
>>>>>> +void nfs_wait_for_outstanding_writes(struct inode *inode)
>>>>>> +{
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> +
>>>>>> + if (nfs_max_outstanding_writes) {
>>>>>> + wait_event(nfsi->writes_wq,
>>>>>> + atomic_read(&nfsi->writes) <
>>>>>> nfs_max_outstanding_writes);
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> int nfs_writepages(struct address_space *mapping, struct
>>>>>> writeback_control *wbc)
>>>>>> {
>>>>>> struct inode *inode = mapping->host;
>>>>>> @@ -369,6 +398,7 @@ static int nfs_inode_add_request(struct
>>>>>> SetPagePrivate(req->wb_page);
>>>>>> set_page_private(req->wb_page, (unsigned long)req);
>>>>>> nfsi->npages++;
>>>>>> + atomic_inc(&nfsi->ndirty);
>>>>>> kref_get(&req->wb_kref);
>>>>>> radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
>>>>>> NFS_PAGE_TAG_LOCKED);
>>>>>> @@ -405,6 +435,10 @@ static void nfs_inode_remove_request(str
>>>>>> static void
>>>>>> nfs_mark_request_dirty(struct nfs_page *req)
>>>>>> {
>>>>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> +
>>>>>> + atomic_inc(&nfsi->ndirty);
>>>>>> __set_page_dirty_nobuffers(req->wb_page);
>>>>>> }
>>>>>>
>>>>>> @@ -633,6 +667,7 @@ static struct nfs_page *nfs_try_to_updat
>>>>>> req->wb_bytes = end - req->wb_offset;
>>>>>> else
>>>>>> req->wb_bytes = rqend - req->wb_offset;
>>>>>> + atomic_inc(&NFS_I(inode)->ndirty);
>>>>>> out_unlock:
>>>>>> spin_unlock(&inode->i_lock);
>>>>>> return req;
>>>>>> @@ -855,6 +890,8 @@ static int nfs_write_rpcsetup(struct nfs
>>>>>> count,
>>>>>> (unsigned long long)data->args.offset);
>>>>>>
>>>>>> + nfs_inc_outstanding_writes(inode);
>>>>>> +
>>>>>> task = rpc_run_task(&task_setup_data);
>>>>>> if (IS_ERR(task))
>>>>>> return PTR_ERR(task);
>>>>>> @@ -1130,7 +1167,7 @@ int nfs_writeback_done(struct rpc_task *
>>>>>> */
>>>>>> status = NFS_PROTO(data->inode)->write_done(task, data);
>>>>>> if (status != 0)
>>>>>> - return status;
>>>>>> + goto out;
>>>>>> nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES,
>>>>>> resp->count);
>>>>>>
>>>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>>>> @@ -1186,7 +1223,9 @@ int nfs_writeback_done(struct rpc_task *
>>>>>> /* Can't do anything about it except throw an error. */
>>>>>> task->tk_status = -EIO;
>>>>>> }
>>>>>> - return 0;
>>>>>> +out:
>>>>>> + nfs_dec_outstanding_writes(data->inode);
>>>>>> + return status;
>>>>>> }
>>>>>>
>>>>>>
>>>>>> @@ -1546,6 +1585,29 @@ int nfs_wb_page(struct inode *inode, str
>>>>>> return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
>>>>>> }
>>>>>>
>>>>>> +/*
>>>>>> + * Start the WRITE requests for dirty pages on their way.
>>>>>> + * This is used when a sufficient number of dirty pages
>>>>>> + * have accumulated.
>>>>>> + */
>>>>>> +int nfs_wb_interim(struct inode *inode)
>>>>>> +{
>>>>>> + struct address_space *mapping = inode->i_mapping;
>>>>>> + struct writeback_control wbc = {
>>>>>> + .bdi = mapping->backing_dev_info,
>>>>>> + .sync_mode = WB_SYNC_NONE,
>>>>>> + .nr_to_write = LONG_MAX,
>>>>>> + .range_start = 0,
>>>>>> + .range_end = LLONG_MAX,
>>>>>> + };
>>>>>> + int ret;
>>>>>> +
>>>>>> + ret = nfs_writepages(mapping, &wbc);
>>>>>> + if (ret < 0)
>>>>>> + __mark_inode_dirty(inode, I_DIRTY_PAGES);
>>>>>> + return ret;
>>>>>> +}
>>>>>> +
>>>>>> int __init nfs_init_writepagecache(void)
>>>>>> {
>>>>>> nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
>>>>>> --- linux-2.6.28.i686/fs/nfs/sysctl.c.org
>>>>>> +++ linux-2.6.28.i686/fs/nfs/sysctl.c
>>>>>> @@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
>>>>>> .mode = 0644,
>>>>>> .proc_handler = &proc_dointvec,
>>>>>> },
>>>>>> + {
>>>>>> + .ctl_name = CTL_UNNUMBERED,
>>>>>> + .procname = "nfs_max_outstanding_writes",
>>>>>> + .data = &nfs_max_outstanding_writes,
>>>>>> + .maxlen = sizeof(int),
>>>>>> + .mode = 0644,
>>>>>> + .proc_handler = &proc_dointvec,
>>>>>> + },
>>>>>> { .ctl_name = 0 }
>>>>>> };
>>>>>>
>>>>>> --- linux-2.6.28.i686/fs/nfs/file.c.org
>>>>>> +++ linux-2.6.28.i686/fs/nfs/file.c
>>>>>> @@ -512,11 +512,17 @@ static int nfs_need_sync_write(struct fi
>>>>>> return 0;
>>>>>> }
>>>>>>
>>>>>> +static int nfs_is_serial(struct inode *inode, loff_t pos)
>>>>>> +{
>>>>>> + return NFS_I(inode)->wrpos == pos;
>>>>>> +}
>>>>>> +
>>>>>> static ssize_t nfs_file_write(struct kiocb *iocb, const struct
>>>>>> iovec *iov,
>>>>>> unsigned long nr_segs, loff_t pos)
>>>>>> {
>>>>>> - struct dentry * dentry = iocb->ki_filp->f_path.dentry;
>>>>>> - struct inode * inode = dentry->d_inode;
>>>>>> + struct dentry *dentry = iocb->ki_filp->f_path.dentry;
>>>>>> + struct inode *inode = dentry->d_inode;
>>>>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>>>>> ssize_t result;
>>>>>> size_t count = iov_length(iov, nr_segs);
>>>>>>
>>>>>> @@ -530,6 +536,13 @@ static ssize_t nfs_file_write(struct kio
>>>>>> result = -EBUSY;
>>>>>> if (IS_SWAPFILE(inode))
>>>>>> goto out_swapfile;
>>>>>> +
>>>>>> + result = count;
>>>>>> + if (!count)
>>>>>> + goto out;
>>>>>> +
>>>>>> + nfs_wait_for_outstanding_writes(inode);
>>>>>> +
>>>>>> /*
>>>>>> * O_APPEND implies that we must revalidate the file length.
>>>>>> */
>>>>>> @@ -539,17 +552,22 @@ static ssize_t nfs_file_write(struct kio
>>>>>> goto out;
>>>>>> }
>>>>>>
>>>>>> - result = count;
>>>>>> - if (!count)
>>>>>> - goto out;
>>>>>> -
>>>>>> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
>>>>>> result = generic_file_aio_write(iocb, iov, nr_segs, pos);
>>>>>> /* Return error values for O_SYNC and IS_SYNC() */
>>>>>> - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>>>> - int err =
>>>>>> nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
>>>>>> - if (err < 0)
>>>>>> - result = err;
>>>>>> + if (result >= 0) {
>>>>>> + if (nfs_need_sync_write(iocb->ki_filp, inode)) {
>>>>>> + int err;
>>>>>> + err =
>>>>>> nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
>>>>>> + inode);
>>>>>> + if (err < 0)
>>>>>> + result = err;
>>>>>> + } else if (nfs_max_outstanding_writes &&
>>>>>> + nfs_is_serial(inode, pos) &&
>>>>>> + atomic_read(&nfsi->ndirty) >=
>>>>>> NFS_SERVER(inode)->wpages)
>>>>>> + nfs_wb_interim(inode);
>>>>>> + if (result > 0)
>>>>>> + nfsi->wrpos = pos + result;
>>>>>> }
>>>>>> out:
>>>>>> return result;
>>>>>> --- linux-2.6.28.i686/include/linux/nfs_fs.h.org
>>>>>> +++ linux-2.6.28.i686/include/linux/nfs_fs.h
>>>>>> @@ -168,6 +168,7 @@ struct nfs_inode {
>>>>>>
>>>>>> unsigned long ncommit,
>>>>>> npages;
>>>>>> + atomic_t ndirty;
>>>>>>
>>>>>> /* Open contexts for shared mmap writes */
>>>>>> struct list_head open_files;
>>>>>> @@ -186,6 +187,9 @@ struct nfs_inode {
>>>>>> fmode_t delegation_state;
>>>>>> struct rw_semaphore rwsem;
>>>>>> #endif /* CONFIG_NFS_V4*/
>>>>>> + atomic_t writes; /* number of outstanding WRITEs */
>>>>>> + wait_queue_head_t writes_wq;
>>>>>> + loff_t wrpos; /* position after last WRITE */
>>>>>> struct inode vfs_inode;
>>>>>> };
>>>>>>
>>>>>> @@ -459,12 +463,14 @@ extern void nfs_unblock_sillyrename(stru
>>>>>> * linux/fs/nfs/write.c
>>>>>> */
>>>>>> extern int nfs_congestion_kb;
>>>>>> +extern int nfs_max_outstanding_writes;
>>>>>> extern int nfs_writepage(struct page *page, struct
>>>>>> writeback_control *wbc);
>>>>>> extern int nfs_writepages(struct address_space *, struct
>>>>>> writeback_control *);
>>>>>> extern int nfs_flush_incompatible(struct file *file, struct page
>>>>>> *page);
>>>>>> extern int nfs_updatepage(struct file *, struct page *, unsigned
>>>>>> int, unsigned int);
>>>>>> -extern int nfs_writeback_done(struct rpc_task *, struct
>>>>>> nfs_write_data *);
>>>>>> +extern int nfs_writeback_done(struct rpc_task *, struct
>>>>>> nfs_write_data *);
>>>>>> extern void nfs_writedata_release(void *);
>>>>>> +extern void nfs_wait_for_outstanding_writes(struct inode *);
>>>>>>
>>>>>> /*
>>>>>> * Try to write back everything synchronously (but check the
>>>>>> @@ -475,6 +481,7 @@ extern int nfs_wb_all(struct inode *inod
>>>>>> extern int nfs_wb_nocommit(struct inode *inode);
>>>>>> extern int nfs_wb_page(struct inode *inode, struct page* page);
>>>>>> extern int nfs_wb_page_cancel(struct inode *inode, struct page*
>>>>>> page);
>>>>>> +extern int nfs_wb_interim(struct inode *);
>>>>>> #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
>>>>>> extern int nfs_commit_inode(struct inode *, int);
>>>>>> extern struct nfs_write_data *nfs_commitdata_alloc(void);
>>>
>>> --Chuck Lever
>>> chuck[dot]lever[at]oracle[dot]com
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> --Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
>
>
>
>