Use netfslib's read and write iteration helpers, allowing netfslib to take
over the management of the page cache for 9p files and to manage local disk
caching. In particular, this eliminates write_begin, write_end, writepage
and all mentions of struct page and struct folio from 9p.
Note that netfslib now offers the possibility of write-through caching if
that is desirable for 9p: just set the NETFS_ICTX_WRITETHROUGH flag in
v9inode->netfs.flags in v9fs_set_netfs_context().
Note also this is untested as I can't get ganesha.nfsd to correctly parse
the config to turn on 9p support.
Signed-off-by: David Howells <[email protected]>
Reviewed-by: Jeff Layton <[email protected]>
cc: Eric Van Hensbergen <[email protected]>
cc: Latchesar Ionkov <[email protected]>
cc: Dominique Martinet <[email protected]>
cc: Christian Schoenebeck <[email protected]>
cc: [email protected]
cc: [email protected]
cc: [email protected]
---
Notes:
Changes
=======
ver #5)
- Added some missing remote_i_size setting.
- Added missing writepages (else mmap write never written back).
fs/9p/vfs_addr.c | 293 ++++++++++-------------------------------
fs/9p/vfs_file.c | 89 ++-----------
fs/9p/vfs_inode.c | 5 +-
fs/9p/vfs_inode_dotl.c | 7 +-
4 files changed, 85 insertions(+), 309 deletions(-)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055b672a247d..20f072c18ce9 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -19,12 +19,48 @@
#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
+#include <trace/events/netfs.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "cache.h"
#include "fid.h"
+static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+{
+ struct inode *inode = subreq->rreq->inode;
+ struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, err < 0 ? err : subreq->len,
+ false);
+}
+
+static void v9fs_upload_to_server_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ v9fs_upload_to_server(subreq);
+}
+
+/*
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
+ */
+static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, v9fs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
+}
+
/**
* v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
@@ -33,14 +69,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
- struct iov_iter to;
- loff_t pos = subreq->start + subreq->transferred;
- size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
-
- total = p9_client_read(fid, pos, &to, &err);
+ total = p9_client_read(fid, subreq->start + subreq->transferred,
+ &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
@@ -50,23 +82,37 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
}
/**
- * v9fs_init_request - Initialise a read request
+ * v9fs_init_request - Initialise a request
* @rreq: The read request
* @file: The file being read from
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct p9_fid *fid = file->private_data;
-
- BUG_ON(!fid);
+ struct p9_fid *fid;
+ bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
+ rreq->origin == NETFS_WRITEBACK ||
+ rreq->origin == NETFS_WRITETHROUGH ||
+ rreq->origin == NETFS_LAUNDER_WRITE ||
+ rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ rreq->origin == NETFS_DIO_WRITE);
+
+ if (file) {
+ fid = file->private_data;
+ BUG_ON(!fid);
+ p9_fid_get(fid);
+ } else {
+ fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
+ if (!fid) {
+ WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
+ rreq->inode->i_private);
+ return -EINVAL;
+ }
+ }
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
- !(fid->mode & P9_ORDWR));
-
- p9_fid_get(fid);
+ WARN_ON(writing && !(fid->mode & P9_ORDWR));
rreq->netfs_priv = fid;
return 0;
}
@@ -86,217 +132,16 @@ const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
.issue_read = v9fs_issue_read,
+ .create_write_requests = v9fs_create_write_requests,
};
-#ifdef CONFIG_9P_FSCACHE
-static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct v9fs_inode *v9inode = priv;
- __le32 version;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS) {
- version = cpu_to_le32(v9inode->qid.version);
- fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->netfs.inode), 0);
- }
-}
-#endif
-
-static int v9fs_vfs_write_folio_locked(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- loff_t start = folio_pos(folio);
- loff_t i_size = i_size_read(inode);
- struct iov_iter from;
- size_t len = folio_size(folio);
- struct p9_fid *writeback_fid;
- int err;
- struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
- struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
-
- if (start >= i_size)
- return 0; /* Simultaneous truncation occurred */
-
- len = min_t(loff_t, i_size - start, len);
-
- iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
-
- writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
- if (!writeback_fid) {
- WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
- inode->i_private);
- return -EINVAL;
- }
-
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
-
- p9_client_write(writeback_fid, start, &from, &err);
-
-#ifdef CONFIG_9P_FSCACHE
- if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
- folio_start_fscache(folio);
- fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
- }
-#endif
-
- folio_end_writeback(folio);
- p9_fid_put(writeback_fid);
-
- return err;
-}
-
-static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int retval;
-
- p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
-
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval < 0) {
- if (retval == -EAGAIN) {
- folio_redirty_for_writepage(wbc, folio);
- retval = 0;
- } else {
- mapping_set_error(folio_mapping(folio), retval);
- }
- } else
- retval = 0;
-
- folio_unlock(folio);
- return retval;
-}
-
-static int v9fs_launder_folio(struct folio *folio)
-{
- int retval;
-
- if (folio_clear_dirty_for_io(folio)) {
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval)
- return retval;
- }
- folio_wait_fscache(folio);
- return 0;
-}
-
-/**
- * v9fs_direct_IO - 9P address space operation for direct I/O
- * @iocb: target I/O control block
- * @iter: The data/buffer to use
- *
- * The presence of v9fs_direct_IO() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode, we shunt off direct read and write requests before
- * the VFS gets them, so this method should never be called.
- *
- * Direct IO is not 'yet' supported in the cached mode. Hence when
- * this routine is called through generic_file_aio_read(), the read/write fails
- * with an error.
- *
- */
-static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t n;
- int err = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
- if (n) {
- struct inode *inode = file_inode(file);
- loff_t i_size = i_size_read(inode);
-
- if (pos + n > i_size)
- inode_add_bytes(inode, pos + n - i_size);
- }
- } else {
- n = p9_client_read(file->private_data, pos, iter, &err);
- }
- return n ? n : err;
-}
-
-static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct page **subpagep, void **fsdata)
-{
- int retval;
- struct folio *folio;
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
- if (retval < 0)
- return retval;
-
- *subpagep = &folio->page;
- return retval;
-}
-
-static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct page *subpage, void *fsdata)
-{
- loff_t last_pos = pos + copied;
- struct folio *folio = page_folio(subpage);
- struct inode *inode = mapping->host;
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- if (!folio_test_uptodate(folio)) {
- if (unlikely(copied < len)) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
- */
- if (last_pos > inode->i_size) {
- inode_add_bytes(inode, last_pos - inode->i_size);
- i_size_write(inode, last_pos);
-#ifdef CONFIG_9P_FSCACHE
- fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
- &last_pos);
-#endif
- }
- folio_mark_dirty(folio);
-out:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
const struct address_space_operations v9fs_addr_operations = {
- .read_folio = netfs_read_folio,
- .readahead = netfs_readahead,
- .dirty_folio = netfs_dirty_folio,
- .writepage = v9fs_vfs_writepage,
- .write_begin = v9fs_write_begin,
- .write_end = v9fs_write_end,
- .release_folio = netfs_release_folio,
- .invalidate_folio = netfs_invalidate_folio,
- .launder_folio = v9fs_launder_folio,
- .direct_IO = v9fs_direct_IO,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .launder_folio = netfs_launder_folio,
+ .direct_IO = noop_direct_IO,
+ .writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..bae330c2f0cf 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -353,25 +353,15 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
fid->fid, iov_iter_count(to), iocb->ki_pos);
- if (!(fid->mode & P9L_DIRECT)) {
- p9_debug(P9_DEBUG_VFS, "(cached)\n");
- return generic_file_read_iter(iocb, to);
- }
-
- if (iocb->ki_filp->f_flags & O_NONBLOCK)
- ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
- else
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
- if (!ret)
- return err;
+ if (fid->mode & P9L_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
- iocb->ki_pos += ret;
- return ret;
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return netfs_file_read_iter(iocb, to);
}
/*
@@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct p9_fid *fid = file->private_data;
- ssize_t retval;
- loff_t origin;
- int err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
- if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
- p9_debug(P9_DEBUG_CACHE, "(cached)\n");
- return generic_file_write_iter(iocb, from);
- }
+ if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))
+ return netfs_unbuffered_write_iter(iocb, from);
- retval = generic_write_checks(iocb, from);
- if (retval <= 0)
- return retval;
-
- origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
- if (retval > 0) {
- struct inode *inode = file_inode(file);
- loff_t i_size;
- unsigned long pg_start, pg_end;
-
- pg_start = origin >> PAGE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- iocb->ki_pos += retval;
- i_size = i_size_read(inode);
- if (iocb->ki_pos > i_size) {
- inode_add_bytes(inode, iocb->ki_pos - i_size);
- /*
- * Need to serialize against i_size_write() in
- * v9fs_stat2inode()
- */
- v9fs_i_size_write(inode, iocb->ki_pos);
- }
- return retval;
- }
- return err;
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return netfs_file_write_iter(iocb, from);
}
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
@@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
- struct file *filp = vmf->vma->vm_file;
- struct inode *inode = file_inode(filp);
-
-
- p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
- folio, (unsigned long)filp->private_data);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_NOPAGE;
-#endif
-
- /* Update file times before taking page lock */
- file_update_time(filp);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- if (folio_mapping(folio) != inode->i_mapping)
- goto out_unlock;
- folio_wait_stable(folio);
-
- return VM_FAULT_LOCKED;
-out_unlock:
- folio_unlock(folio);
- return VM_FAULT_NOPAGE;
+ return netfs_page_mkwrite(vmf, NULL);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 74122540e00f..55345753ae8d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -374,10 +374,8 @@ void v9fs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
-#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
netfs_clear_inode_writeback(inode, &version);
-#endif
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -1112,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE) {
@@ -1180,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->length;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c7319af2f471..e25fbc988f09 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -598,7 +598,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE)
@@ -655,6 +655,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->st_size;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -683,8 +684,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mode = mode;
}
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE)
+ stat->st_result_mask & P9_STATS_SIZE) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
This needs a fix that I would fold in. Somehow it gets through xfstests
without it, but it seems problems can be caused with executables.
David
---
9p: Fix initialisation of netfs_inode for 9p
The 9p filesystem is calling netfs_inode_init() in v9fs_init_inode() -
before the struct inode fields have been initialised from the obtained file
stats (i.e. before v9fs_stat2inode*() has been called) - but netfslib wants
to set a couple of its fields from i_size.
Reported-by: Marc Dionne <[email protected]>
Signed-off-by: David Howells <[email protected]>
Tested-by: Marc Dionne <[email protected]>
cc: Eric Van Hensbergen <[email protected]>
cc: Latchesar Ionkov <[email protected]>
cc: Dominique Martinet <[email protected]>
cc: Christian Schoenebeck <[email protected]>
cc: [email protected]
cc: [email protected]
cc: [email protected]
---
fs/9p/v9fs_vfs.h | 1 +
fs/9p/vfs_inode.c | 6 +++---
fs/9p/vfs_inode_dotl.c | 1 +
3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 731e3d14b67d..0e8418066a48 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
dev_t rdev);
+void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b66466e97459..32572982f72e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -246,7 +246,7 @@ void v9fs_free_inode(struct inode *inode)
/*
* Set parameters for the netfs library
*/
-static void v9fs_set_netfs_context(struct inode *inode)
+void v9fs_set_netfs_context(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
@@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
-
- v9fs_set_netfs_context(inode);
error:
return err;
@@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
iput(inode);
return ERR_PTR(err);
}
+ v9fs_set_netfs_context(inode);
return inode;
}
@@ -461,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
goto error;
v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e25fbc988f09..3505227e1704 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
Dominique Martinet <[email protected]> wrote:
> I've also manually confirmed one of the big improvements I'd been asking
> for (that writes in cached modes, which used to be chunked to 4k, and
> are now properly aggregated, so e.g 'dd bs=1M count=1' will properly
> issue a minimal number of TWRITE calls capped by msize) -- this is
> great!
After the merge window, we can look at enabling multipage folios for 9p.
> I've noticed we don't cache xattrs at all,
I haven't given this any particular thought. We could attach them to the
cachefile object as xattrs, but it means you have to do two xattr lookups in
the event of a cache miss.
At this point, I'm going to ask Christian to stack the extra patch on his
branch rather than folding it down and retagging.
> I've got a couple of questions below, but:
I'll address those separately.
> Tested-by: Dominique Martinet <[email protected]>
> Acked-by: Dominique Martinet <[email protected]>
Thanks!
David
Dominique Martinet <[email protected]> wrote:
> > +static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
> > +{
> > + struct inode *inode = subreq->rreq->inode;
> > + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
>
> Any reason to have this variable assignment at all?
I'll just remove it. The __maybe_unused suppressed the warning, otherwise I'd
have removed it already.
> p9_client_write return value should always be subreq->len, but I believe
> we should use it unless err is set.
> (It's also possible for partial writes to happen, e.g. p9_client_write
> looped a few times and then failed, at which point the size returned
> would be the amount that actually got through -- we probably should do
> something with that?)
How about something like:
- int err;
+ int err, len;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
- netfs_write_subrequest_terminated(subreq, err < 0 ? err : subreq->len,
- false);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
> > + total = p9_client_read(fid, subreq->start + subreq->transferred,
> > + &subreq->io_iter, &err);
>
> Just to clarify: subreq->io_iter didn't exist (or some conditions to use
> it weren't cleared) before?
Correct. It's added in the netfs-lib patches. I've provided a way to
separate the user-side iterator from the I/O-side iterator to allow the use of
a bounce buffer for the purpose of content crypto, compression or just having
to deal with RMW cycles to a larger block size on the server.
> > + if (file) {
> > + fid = file->private_data;
> > + BUG_ON(!fid);
>
> This probably should be WARN + return EINVAL like find by inode?
> It's certainly a huge problem, but we should avoid BUG if possible...
Sure. The BUG_ON() was already there, but I can turn it into a WARN+error.
> nit: not sure what's cleaner?
> Since there's a message that makes for a bit awkward if...
>
> if (WARN_ONCE(!fid, "folio expected an open fid inode->i_private=%p\n",
> rreq->inode->i_private))
> return -EINVAL;
>
> (as a side note, I'm not sure what to make of this i_private pointer
> here, but if that'll help you figure something out sure..)
Um. 9p is using i_private. But perhaps i_ino would be a better choice:
if (file) {
fid = file->private_data;
if (!fid)
goto no_fid;
p9_fid_get(fid);
} else {
fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
if (!fid)
goto no_fid;
}
...
no_fid:
WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
rreq->inode->i_ino);
return -EINVAL;
> This is as follow on your netfs-lib branch:
> - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
> - !(fid->mode & P9_ORDWR));
> -
> - p9_fid_get(fid);
> + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
>
> So the WARN_ON has been reverted back with only indentation changed;
> I guess there were patterns that were writing despite the fid not having
> been open as RDWR?
> Do you still have details about these?
The condition in the WARN_ON() here got changed. It was:
WARN_ON(writing && ...
at one point, but that caused a bunch of incorrect warnings to appear because
only NETFS_READ_FOR_WRITE requires read-access as well as write-access. All
the others:
bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
rreq->origin == NETFS_WRITEBACK ||
rreq->origin == NETFS_WRITETHROUGH ||
rreq->origin == NETFS_LAUNDER_WRITE ||
rreq->origin == NETFS_UNBUFFERED_WRITE ||
rreq->origin == NETFS_DIO_WRITE);
only require write-access.
There will be an additional one if we roll out content crypto to 9p as we may
need to do RMW cycles occasionally - but that's off to one side just for the
moment.
> If a file has been open without the write bit it might not go through,
> and it's incredibly difficult to get such users back to userspace in
> async cases (e.g. mmap flushes), so would like to understand that.
The VFS/VM should prevent writing to files that aren't open O_WRONLY or
O_RDWR, so I don't think we should be called in otherwise.
Note that I'm intending to change the way fscache is driven when we fetch
cacheable data from the server so that I can free up the PG_fscache bit and
return it to the MM folks. Instead of marking the page PG_fscache, I mark it
PG_dirty and set page->private with a special value to indicate it should only
be written to the cache - then the writepages sees that and just writes these
pages to the cache. I have a patch to do this and it seems to work, but I
need to make ceph and cifs use netfslib before I can apply it.
> > + p9_debug(P9_DEBUG_VFS, "(cached)\n");
>
> (Not a new problem so no need to address here, but having just
> "(cached)" on a split line is a bit weird.. We first compute cached or
> not as a bool and make it %s + cached ? " (cached)" : "" or
> something... I'll send a patch after this gets in to avoid conflicts)
Okay.
> > + return netfs_page_mkwrite(vmf, NULL);
>
> (I guess there's no helper that could be used directly in .page_mkwrite
> op?)
I could provide a helper that just supplies NULL as the second argument. I
think only 9p will use it, but that's fine.
David
David Howells wrote on Wed, Jan 03, 2024 at 12:39:34PM +0000:
> > p9_client_write return value should always be subreq->len, but I believe
> > we should use it unless err is set.
> > (It's also possible for partial writes to happen, e.g. p9_client_write
> > looped a few times and then failed, at which point the size returned
> > would be the amount that actually got through -- we probably should do
> > something with that?)
>
> How about something like:
>
> - int err;
> + int err, len;
>
> trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
> - p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
> - netfs_write_subrequest_terminated(subreq, err < 0 ? err : subreq->len,
> - false);
> + len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
> + netfs_write_subrequest_terminated(subreq, len ?: err, false);
I think that'll be fine; plain write() syscall works like this when an
error happens after some data has been flushed, and I assume there'll be
some retry if this happened on something like a flush dirty and it got
a partial write reported?
> > > + if (file) {
> > > + fid = file->private_data;
> > > + BUG_ON(!fid);
> >
> > This probably should be WARN + return EINVAL like find by inode?
> > It's certainly a huge problem, but we should avoid BUG if possible...
>
> Sure. The BUG_ON() was already there, but I can turn it into a WARN+error.
Thanks.
> > nit: not sure what's cleaner?
> > Since there's a message that makes for a bit awkward if...
> >
> > if (WARN_ONCE(!fid, "folio expected an open fid inode->i_private=%p\n",
> > rreq->inode->i_private))
> > return -EINVAL;
> >
> > (as a side note, I'm not sure what to make of this i_private pointer
> > here, but if that'll help you figure something out sure..)
>
> Um. 9p is using i_private. But perhaps i_ino would be a better choice:
>
> if (file) {
> fid = file->private_data;
> if (!fid)
> goto no_fid;
> p9_fid_get(fid);
> } else {
> fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
> if (!fid)
> goto no_fid;
> }
>
> ...
>
> no_fid:
> WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
> rreq->inode->i_ino);
> return -EINVAL;
Might be useful to track down if this came from a file without private
data or lookup failing, but given this was a bug I guess we can deal
with that when that happens -- ack.
> > This is as follow on your netfs-lib branch:
> > - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
> > - !(fid->mode & P9_ORDWR));
> > -
> > - p9_fid_get(fid);
> > + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
> >
> > So the WARN_ON has been reverted back with only indentation changed;
> > I guess there were patterns that were writing despite the fid not having
> > been open as RDWR?
> > Do you still have details about these?
>
> The condition in the WARN_ON() here got changed. It was:
>
> WARN_ON(writing && ...
>
> at one point, but that caused a bunch of incorrect warnings to appear because
> only NETFS_READ_FOR_WRITE requires read-access as well as write-access. All
> the others:
>
> bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
> rreq->origin == NETFS_WRITEBACK ||
> rreq->origin == NETFS_WRITETHROUGH ||
> rreq->origin == NETFS_LAUNDER_WRITE ||
> rreq->origin == NETFS_UNBUFFERED_WRITE ||
> rreq->origin == NETFS_DIO_WRITE);
>
> only require write-access.
Thanks for clarifying
> > If a file has been open without the write bit it might not go through,
> > and it's incredibly difficult to get such users back to userspace in
> > async cases (e.g. mmap flushes), so would like to understand that.
>
> The VFS/VM should prevent writing to files that aren't open O_WRONLY or
> O_RDWR, so I don't think we should be called in otherwise.
Historically this check was more about finding a fid that wasn't opened
properly than the VFS doing something weird (e.g. by calling mprotect
after mmap and us missing that -- would need to check if that works
actually...)
> > > + return netfs_page_mkwrite(vmf, NULL);
> >
> > (I guess there's no helper that could be used directly in .page_mkwrite
> > op?)
>
> I could provide a helper that just supplies NULL as the second argument. I
> think only 9p will use it, but that's fine.
If we're the only user I guess we shouldn't bother with it at this
point, we can come back to it if this ever becomes common.
--
Dominique Martinet | Asmadeus
Hi Andrea,
Note that there are changes to the netfslib write-side upstream and you might
also want to apply the attached.
In https://bugs.launchpad.net/ubuntu/+source/autopkgtest/+bug/2056461 you say:
| It seems that kernel 6.8 introduced a regression in the 9pfs related to
| caching and netfslib, that can cause some user-space apps to read content
| from files that is not up-to-date (when they are used in a producer/consumer
| fashion).
Can you clarify how these files are being used?
David
---
commit 39302c160390441ed5b4f4f7ad480c44eddf0962
Author: David Howells <[email protected]>
Date: Wed May 22 17:30:22 2024 +0100
netfs, 9p: Fix race between umount and async request completion
There's a problem in 9p's interaction with netfslib whereby a crash occurs
because the 9p_fid structs get forcibly destroyed during client teardown
(without paying attention to their refcounts) before netfslib has finished
with them. However, it's not a simple case of deferring the clunking that
p9_fid_put() does as that requires the client.
The problem is that netfslib has to unlock pages and clear the IN_PROGRESS
flag before destroying the objects involved - including the fid - and, in
any case, nothing checks to see if writeback completed barring looking at
the page flags.
Fix this by keeping a count of outstanding I/O requests (of any type) and
waiting for it to quiesce during inode eviction.
Signed-off-by: David Howells <[email protected]>
cc: Eric Van Hensbergen <[email protected]>
cc: Latchesar Ionkov <[email protected]>
cc: Dominique Martinet <[email protected]>
cc: Christian Schoenebeck <[email protected]>
cc: Jeff Layton <[email protected]>
cc: Steve French <[email protected]>
cc: [email protected]
cc: [email protected]
cc: [email protected]
cc: [email protected]
cc: [email protected]
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8c9a896d691e..57cfa9f65046 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -354,6 +354,7 @@ void v9fs_evict_inode(struct inode *inode)
version = cpu_to_le32(v9inode->qid.version);
netfs_clear_inode_writeback(inode, &version);
+ netfs_wait_for_outstanding_io(inode);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -361,8 +362,10 @@ void v9fs_evict_inode(struct inode *inode)
if (v9fs_inode_cookie(v9inode))
fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
#endif
- } else
+ } else {
+ netfs_wait_for_outstanding_io(inode);
clear_inode(inode);
+ }
}
struct inode *
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 94fc049aff58..c831e711a4ac 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -652,6 +652,7 @@ void afs_evict_inode(struct inode *inode)
afs_set_cache_aux(vnode, &aux);
netfs_clear_inode_writeback(inode, &aux);
+ netfs_wait_for_outstanding_io(inode);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index c90d482b1650..f4a642727479 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ atomic_inc(&ctx->io_count);
trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
@@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
@@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
+
+ if (atomic_dec_and_test(&ictx->io_count))
+ wake_up_var(&ictx->io_count);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index ec5b639f421a..21c9e173ea9a 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -435,6 +435,7 @@ cifs_evict_inode(struct inode *inode)
if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
+ netfs_wait_for_outstanding_io(inode);
clear_inode(inode);
}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d2d291a9cdad..3ca3906bb8da 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -68,6 +68,7 @@ struct netfs_inode {
loff_t remote_i_size; /* Size of the remote file */
loff_t zero_point; /* Size after which we assume there's no data
* on the server */
+ atomic_t io_count; /* Number of outstanding reqs */
unsigned long flags;
#define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */
#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */
@@ -474,6 +475,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
ctx->remote_i_size = i_size_read(&ctx->inode);
ctx->zero_point = LLONG_MAX;
ctx->flags = 0;
+ atomic_set(&ctx->io_count, 0);
#if IS_ENABLED(CONFIG_FSCACHE)
ctx->cache = NULL;
#endif
@@ -517,4 +519,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx)
#endif
}
+/**
+ * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete
+ * @inode: The netfs inode to wait on
+ *
+ * Wait for outstanding I/O requests of any type to complete. This is intended
+ * to be called from inode eviction routines. This makes sure that any
+ * resources held by those requests are cleaned up before we let the inode get
+ * cleaned up.
+ */
+static inline void netfs_wait_for_outstanding_io(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
+}
+
#endif /* _LINUX_NETFS_H */
Hi David,
On 2024-05-23 08:44, David Howells wrote:
> In https://bugs.launchpad.net/ubuntu/+source/autopkgtest/+bug/2056461 you say:
>
> | It seems that kernel 6.8 introduced a regression in the 9pfs related to
> | caching and netfslib, that can cause some user-space apps to read content
> | from files that is not up-to-date (when they are used in a producer/consumer
> | fashion).
>
> Can you clarify how these files are being used?
I don't know the details of the 9pfs operations involved, but still I
wanted to mention that to reliably reproduce the issue on a Debian
system one can run:
autopkgtest-build-qemu unstable /tmp/sid.img
autopkgtest -ddd -B dpdk -- autopkgtest-virt-qemu --debug --show-boot /tmp/sid.img
If the kernel installed in the guest VM is affected by this problem,
after a while the test hangs with something like:
autopkgtest-virt-qemu: DBG: executing copydown /tmp/alog/tests-tree/ /tmp/autopkgtest.uG6tsJ/build.6QA/src/
[...]
autopkgtest-virt-qemu: DBG: +>?
Full logs at https://people.debian.org/~ema/1072004-6.10-rc1.log
Part of the code mounting the 9pfs in case it helps is at:
https://salsa.debian.org/ci-team/autopkgtest/-/blob/master/virt/autopkgtest-virt-qemu#L290
I could reproduce the issue with both 6.9.2 and 6.10-rc1.
Hi again,
On 2024-05-23 08:44, David Howells wrote:
> commit 39302c160390441ed5b4f4f7ad480c44eddf0962
> Author: David Howells <[email protected]>
> Date: Wed May 22 17:30:22 2024 +0100
>
> netfs, 9p: Fix race between umount and async request completion
I have tried this patch on top of 6.10-rc1 and unfortunately the problem
persists.
Meanwhile TJ (in CC) has been doing a lot of further investigation and
opened https://bugzilla.kernel.org/show_bug.cgi?id=218916.