2023-07-11 05:06:56

by Jiachen Zhang

[permalink] [raw]
Subject: [PATCH 4/5] fuse: writeback_cache consistency enhancement (writeback_cache_v2)

Some users may want both the high performance of the writeback_cahe mode
and a little bit more consistency among FUSE mounts. Current
writeback_cache mode never updates attributes from server, so can never
see the file attributes changed by other FUSE mounts, which means
'zero-consisteny'.

This commit introduces writeback_cache_v2 mode, which allows the attributes
to be updated from server to kernel when the inode is clean and no
writeback is in-progressing. FUSE daemons can select this mode by the
FUSE_WRITEBACK_CACHE_V2 init flag.

In writeback_cache_v2 mode, the server generates official attributes.
Therefore,

1. For the cmtime, the cmtime generated by kernel are just temporary
values that are never flushed to server by fuse_write_inode(), and they
could be eventually updated by the official server cmtime. The
mtime-based revalidation of the fc->auto_inval_data mode is also
skipped, as the kernel-generated temporary cmtime are likely not equal
to the offical server cmtime.

2. For the file size, we expect server updates its file size on
FUSE_WRITEs. So we increase fi->attr_version in fuse_writepage_end() to
check the staleness of the returning file size.

Together with FOPEN_INVAL_ATTR, a FUSE daemon is able to implement
close-to-open (CTO) consistency like NFS client implementations.

Signed-off-by: Jiachen Zhang <[email protected]>
---
fs/fuse/file.c | 25 +++++++++++++++++++++++
fs/fuse/fuse_i.h | 6 ++++++
fs/fuse/inode.c | 42 +++++++++++++++++++++++++++++++++++++--
include/uapi/linux/fuse.h | 9 ++++++++-
4 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 412824a11b7b..09416caea575 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1914,6 +1914,10 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
*/
fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
+
+ if (fc->writeback_cache_v2)
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+
fi->writectr--;
fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
@@ -1943,10 +1947,18 @@ static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff;
int err;

+ /*
+ * Kernel c/mtime should not be updated to the server in the
+ * writeback_cache_v2 mode as server c/mtime are official.
+ */
+ if (fc->writeback_cache_v2)
+ return 0;
+
/*
* Inode is always written before the last reference is dropped and
* hence this should not be reached from reclaim.
@@ -2375,11 +2387,14 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct page *page;
loff_t fsize;
int err = -ENOMEM;

WARN_ON(!fc->writeback_cache);
+ if (fc->writeback_cache_v2)
+ mutex_lock(&fi->wb_attr_mutex);

page = grab_cache_page_write_begin(mapping, index);
if (!page)
@@ -2411,6 +2426,9 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
error:
+ if (fc->writeback_cache_v2)
+ mutex_unlock(&fi->wb_attr_mutex);
+
return err;
}

@@ -2419,6 +2437,7 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);

/* Haven't copied anything? Skip zeroing, size extending, dirtying. */
if (!copied)
@@ -2442,6 +2461,12 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);

+ if (fc->writeback_cache_v2) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ mutex_unlock(&fi->wb_attr_mutex);
+ }
+
return copied;
}

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 9b7fc7d3c7f1..200be199eb93 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -155,6 +155,9 @@ struct fuse_inode {
*/
struct fuse_inode_dax *dax;
#endif
+
+ /** Lock for serializing size updates in writeback_cache_v2 mode */
+ struct mutex wb_attr_mutex;
};

/** FUSE inode state bits */
@@ -656,6 +659,9 @@ struct fuse_conn {
/* show legacy mount options */
unsigned int legacy_opts_show:1;

+ /* Improved writeback cache policy */
+ unsigned writeback_cache_v2:1;
+
/*
* fs kills suid/sgid/cap on write/chown/trunc. suid is killed on
* write/trunc only if caller did not have CAP_FSETID. sgid is killed
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3e0b1fb1db17..958f8534a585 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -84,6 +84,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ mutex_init(&fi->wb_attr_mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget)
@@ -246,14 +247,36 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u32 cache_mask;
loff_t oldsize;
struct timespec64 old_mtime;
+ bool try_wb_update = false;
+
+ if (fc->writeback_cache_v2 && S_ISREG(inode->i_mode)) {
+ mutex_lock(&fi->wb_attr_mutex);
+ try_wb_update = true;
+ }

spin_lock(&fi->lock);
/*
* In case of writeback_cache enabled, writes update mtime, ctime and
* may update i_size. In these cases trust the cached value in the
* inode.
+ *
+ * In writeback_cache_v2 mode, if all the following conditions are met,
+ * then we allow the attributes to be refreshed:
+ *
+ * - inode is not in the process of being written (I_SYNC)
+ * - inode has no dirty pages (I_DIRTY_PAGES)
+ * - inode data-related attributes are clean (I_DIRTY_DATASYNC)
+ * - inode does not have any page writeback in progress
+ *
+ * Note: checking PAGECACHE_TAG_WRITEBACK is not sufficient in fuse,
+ * since inode can appear to have no PageWriteback pages, yet still have
+ * outstanding write request.
*/
cache_mask = fuse_get_cache_mask(inode);
+ if (try_wb_update && !(inode->i_state & (I_DIRTY_PAGES | I_SYNC |
+ I_DIRTY_DATASYNC)) && RB_EMPTY_ROOT(&fi->writepages))
+ cache_mask &= ~(STATX_MTIME | STATX_CTIME | STATX_SIZE);
+
if (cache_mask & STATX_SIZE)
attr->size = i_size_read(inode);

@@ -269,6 +292,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fi->lock);
+ if (try_wb_update)
+ mutex_unlock(&fi->wb_attr_mutex);
return;
}

@@ -292,7 +317,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
truncate_pagecache(inode, attr->size);
if (!fc->explicit_inval_data)
inval = true;
- } else if (fc->auto_inval_data) {
+ } else if (!fc->writeback_cache_v2 && fc->auto_inval_data) {
+ /*
+ * When fc->writeback_cache_v2 is set, the old_mtime
+ * can be generated by kernel and must not equal to
+ * new_mtime generated by server. So skip in such
+ * case.
+ */
struct timespec64 new_mtime = {
.tv_sec = attr->mtime,
.tv_nsec = attr->mtimensec,
@@ -312,6 +343,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,

if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_dontcache(inode, attr->flags);
+
+ if (try_wb_update)
+ mutex_unlock(&fi->wb_attr_mutex);
}

static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
@@ -1179,6 +1213,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->async_dio = 1;
if (flags & FUSE_WRITEBACK_CACHE)
fc->writeback_cache = 1;
+ if (flags & FUSE_WRITEBACK_CACHE_V2) {
+ fc->writeback_cache = 1;
+ fc->writeback_cache_v2 = 1;
+ }
if (flags & FUSE_PARALLEL_DIROPS)
fc->parallel_dirops = 1;
if (flags & FUSE_HANDLE_KILLPRIV)
@@ -1262,7 +1300,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_WRITEBACK_CACHE_V2;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 1a24c11637a4..850a3c0f87fb 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -207,6 +207,9 @@
* - add FUSE_EXT_GROUPS
* - add FUSE_CREATE_SUPP_GROUP
* - add FUSE_HAS_EXPIRE_ONLY
+ *
+ * 7.39
+ * - add FUSE_WRITEBACK_CACHE_V2 init flag
*/

#ifndef _LINUX_FUSE_H
@@ -242,7 +245,7 @@
#define FUSE_KERNEL_VERSION 7

/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 38
+#define FUSE_KERNEL_MINOR_VERSION 39

/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -373,6 +376,9 @@ struct fuse_file_lock {
* FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir,
* symlink and mknod (single group that matches parent)
* FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation
+ * FUSE_WRITEBACK_CACHE_V2:
+ * allow time/size to be refreshed if no pending write
+ * c/mtime not updated from kernel to server
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
@@ -411,6 +417,7 @@ struct fuse_file_lock {
#define FUSE_HAS_INODE_DAX (1ULL << 33)
#define FUSE_CREATE_SUPP_GROUP (1ULL << 34)
#define FUSE_HAS_EXPIRE_ONLY (1ULL << 35)
+#define FUSE_WRITEBACK_CACHE_V2 (1ULL << 36)

/**
* CUSE INIT request/reply flags
--
2.20.1