2018-10-07 23:32:08

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 00/19] [V2] orangefs: page cache

V2... see https://marc.info/?l=linux-fsdevel&m=153721507330730&w=2

One important change is the following, without which an unaligned write
may end up written to the beginning of its page. Surprisingly xfstests
did not catch this. This was caught by an invalidate_inode_pages2 call
in read_iter (not part of this patch series) which exposed the bug.

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 34b98d2ed377..cd1263c45bb2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -56,7 +54,7 @@ static int orangefs_writepage_locked(struct page *page,

bv.bv_page = page;
bv.bv_len = wlen;
- bv.bv_offset = 0;
+ bv.bv_offset = off % PAGE_SIZE;
iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, wlen);

ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,

The other big thing is an improved releasepage implementation and a
launder_page implementation.

We also size the writepages buffer based on the client core buffer.
However I'm not happy with the memory allocation in writepages.

Mike, I know you've had some trouble with the original series. I'd like
to know if this fixes that.

Martin Brandenburg (19):
orangefs: implement xattr cache
orangefs: do not invalidate attributes on inode create
orangefs: simplify orangefs_inode_getattr interface
orangefs: update attributes rather than relying on server
orangefs: hold i_lock during inode_getattr
orangefs: set up and use backing_dev_info
orangefs: let setattr write to cached inode
orangefs: reorganize setattr functions to track attribute changes
orangefs: remove orangefs_readpages
orangefs: service ops done for writeback are not killable
orangefs: migrate to generic_file_read_iter
orangefs: implement writepage
orangefs: skip inode writeout if nothing to write
orangefs: write range tracking
orangefs: avoid fsync service operation on flush
orangefs: use kmem_cache for orangefs_write_request
orangefs: implement writepages
orangefs: use client-core buffer size to determine writepages count
orangefs: do writepages_work if a single page must be written

fs/orangefs/acl.c | 4 +-
fs/orangefs/file.c | 194 +++--------
fs/orangefs/inode.c | 628 ++++++++++++++++++++++++++++------
fs/orangefs/namei.c | 41 +--
fs/orangefs/orangefs-cache.c | 24 +-
fs/orangefs/orangefs-kernel.h | 56 ++-
fs/orangefs/orangefs-mod.c | 10 +-
fs/orangefs/orangefs-utils.c | 181 +++++-----
fs/orangefs/super.c | 38 +-
fs/orangefs/waitqueue.c | 18 +-
fs/orangefs/xattr.c | 104 ++++++
11 files changed, 893 insertions(+), 405 deletions(-)

--
2.19.0



2018-10-07 23:29:20

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 07/19] orangefs: let setattr write to cached inode

This is a fairly big change, but ultimately it's not a lot of code.

Implement write_inode and then avoid the call to orangefs_inode_setattr
within orangefs_setattr.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 11 +++--------
fs/orangefs/super.c | 16 ++++++++++++++++
2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index ec3996a61f92..b16b11294573 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -207,8 +207,8 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
*/
int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
{
- int ret = -EINVAL;
struct inode *inode = dentry->d_inode;
+ int ret;

gossip_debug(GOSSIP_INODE_DEBUG,
"%s: called on %pd\n",
@@ -228,16 +228,11 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
setattr_copy(inode, iattr);
mark_inode_dirty(inode);

- ret = orangefs_inode_setattr(inode, iattr);
- gossip_debug(GOSSIP_INODE_DEBUG,
- "%s: orangefs_inode_setattr returned %d\n",
- __func__,
- ret);
-
- if (!ret && (iattr->ia_valid & ATTR_MODE))
+ if (iattr->ia_valid & ATTR_MODE)
/* change mod on a file that has ACLs */
ret = posix_acl_chmod(inode, inode->i_mode);

+ ret = 0;
out:
gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
return ret;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 61bec955b285..788869c8233b 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -151,6 +151,21 @@ static void orangefs_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, orangefs_i_callback);
}

+int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct iattr iattr;
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n");
+ iattr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME |
+ ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME;
+ iattr.ia_mode = inode->i_mode;
+ iattr.ia_uid = inode->i_uid;
+ iattr.ia_gid = inode->i_gid;
+ iattr.ia_atime = inode->i_atime;
+ iattr.ia_mtime = inode->i_mtime;
+ iattr.ia_ctime = inode->i_ctime;
+ return orangefs_inode_setattr(inode, &iattr);
+}
+
/*
* NOTE: information filled in here is typically reflected in the
* output of the system command 'df'
@@ -309,6 +324,7 @@ void fsid_key_table_finalize(void)
static const struct super_operations orangefs_s_ops = {
.alloc_inode = orangefs_alloc_inode,
.destroy_inode = orangefs_destroy_inode,
+ .write_inode = orangefs_write_inode,
.drop_inode = generic_delete_inode,
.statfs = orangefs_statfs,
.remount_fs = orangefs_remount_fs,
--
2.19.0


2018-10-07 23:29:20

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 19/19] orangefs: do writepages_work if a single page must be written

Otherwise the next page can't possibly be an append and it'll
just sit there and write pages one by one until it flushes the
saved region at the very end.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 20950f3f758a..cd1263c45bb2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -307,6 +307,10 @@ static int orangefs_writepages_callback(struct page *page,
wr = (struct orangefs_write_request *)page_private(page);

if (wr->len != PAGE_SIZE) {
+ if (ow->npages) {
+ orangefs_writepages_work(ow, wbc);
+ ow->npages = 0;
+ }
ret = orangefs_writepage_locked(page, wbc);
mapping_set_error(page->mapping, ret);
unlock_page(page);
@@ -335,6 +339,10 @@ static int orangefs_writepages_callback(struct page *page,
}
done:
if (ret == -1) {
+ if (ow->npages) {
+ orangefs_writepages_work(ow, wbc);
+ ow->npages = 0;
+ }
ret = orangefs_writepage_locked(page, wbc);
mapping_set_error(page->mapping, ret);
unlock_page(page);
--
2.19.0


2018-10-07 23:29:20

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 02/19] orangefs: do not invalidate attributes on inode create

When an inode is created, we fetch attributes from the server. There is
no need to turn around and invalidate them.

No need to initialize attributes after the getattr either. Either it'll
be exactly the same, or it'll be something else and wrong.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 6 ------
fs/orangefs/namei.c | 6 ------
2 files changed, 12 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a7a8d3647ffe..2f1a5f36a103 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -459,12 +459,6 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
goto out_iput;

orangefs_init_iops(inode);
-
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_size = PAGE_SIZE;
inode->i_rdev = dev;

error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 625b0580f9be..46b5f06b7e4c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -77,8 +77,6 @@ static int orangefs_create(struct inode *dir,

d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
- ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;

gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %pd\n",
@@ -292,8 +290,6 @@ static int orangefs_symlink(struct inode *dir,

d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
- ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;

gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %pd\n",
@@ -361,8 +357,6 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode

d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
- ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;

gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %pd\n",
--
2.19.0


2018-10-07 23:29:20

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 18/19] orangefs: use client-core buffer size to determine writepages count

The previous fixed count of 128 was arbitrary.

I see about a 10% performance increase on large block size I/O since
the count is now 1024 (given the default four megabyte client-core
buffer).

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 9a41b7d2ce54..20950f3f758a 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -15,8 +15,6 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

-#define ORANGEFS_WRITEPAGES_COUNT 128
-
static int orangefs_writepage_locked(struct page *page,
struct writeback_control *wbc)
{
@@ -237,9 +235,10 @@ struct orangefs_writepages {
size_t len;
kuid_t uid;
kgid_t gid;
- struct page *pages[ORANGEFS_WRITEPAGES_COUNT];
+ int maxpages;
int npages;
- struct bio_vec bv[ORANGEFS_WRITEPAGES_COUNT];
+ struct page **pages;
+ struct bio_vec *bv;
};

static int orangefs_writepages_work(struct orangefs_writepages *ow,
@@ -324,7 +323,7 @@ static int orangefs_writepages_callback(struct page *page,
}
if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
orangefs_writepages_work(ow, wbc);
- memset(ow, 0, sizeof *ow);
+ ow->npages = 0;
ret = -1;
goto done;
}
@@ -340,9 +339,9 @@ static int orangefs_writepages_callback(struct page *page,
mapping_set_error(page->mapping, ret);
unlock_page(page);
} else {
- if (ow->npages == ORANGEFS_WRITEPAGES_COUNT) {
+ if (ow->npages == ow->maxpages) {
orangefs_writepages_work(ow, wbc);
- memset(ow, 0, sizeof *ow);
+ ow->npages = 0;
}
}
}
@@ -358,6 +357,18 @@ static int orangefs_writepages(struct address_space *mapping,
ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
if (!ow)
return -ENOMEM;
+ ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
+ ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
+ if (!ow->pages) {
+ kfree(ow);
+ return -ENOMEM;
+ }
+ ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
+ if (!ow->bv) {
+ kfree(ow->pages);
+ kfree(ow);
+ return -ENOMEM;
+ }
mutex_lock(&ORANGEFS_SB(mapping->host->i_sb)->writepages_mutex);
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
@@ -365,6 +376,8 @@ static int orangefs_writepages(struct address_space *mapping,
ret = orangefs_writepages_work(ow, wbc);
blk_finish_plug(&plug);
mutex_unlock(&ORANGEFS_SB(mapping->host->i_sb)->writepages_mutex);
+ kfree(ow->pages);
+ kfree(ow->bv);
kfree(ow);
return ret;
}
--
2.19.0


2018-10-07 23:29:20

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 17/19] orangefs: implement writepages

Go through pages and look for a consecutive writable region. After
finding 128 consecutive writable pages or when finding a non-consecutive
region, do the write.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 1 +
fs/orangefs/inode.c | 144 +++++++++++++++++++++++++++++++++-
fs/orangefs/orangefs-kernel.h | 1 +
fs/orangefs/super.c | 1 +
4 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index d5ecfea3288a..0d9cef37404f 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -491,6 +491,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
gossip_debug(GOSSIP_INODE_DEBUG,
"flush_racache finished\n");
}
+
}
return 0;
}
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index f53d768acdd9..9a41b7d2ce54 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -15,6 +15,8 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

+#define ORANGEFS_WRITEPAGES_COUNT 128
+
static int orangefs_writepage_locked(struct page *page,
struct writeback_control *wbc)
{
@@ -44,10 +46,10 @@ static int orangefs_writepage_locked(struct page *page,
len = i_size_read(inode);
}
} else {
-/* BUG();*/
+ end_page_writeback(page);
/* It's not private so there's nothing to write, right? */
printk("writepage not private!\n");
- end_page_writeback(page);
+ BUG();
return 0;

}
@@ -230,6 +232,143 @@ static int orangefs_readpage(struct file *file, struct page *page)
return ret;
}

+struct orangefs_writepages {
+ loff_t off;
+ size_t len;
+ kuid_t uid;
+ kgid_t gid;
+ struct page *pages[ORANGEFS_WRITEPAGES_COUNT];
+ int npages;
+ struct bio_vec bv[ORANGEFS_WRITEPAGES_COUNT];
+};
+
+static int orangefs_writepages_work(struct orangefs_writepages *ow,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = ow->pages[0]->mapping->host;
+ struct orangefs_write_request *wrp, wr;
+ struct iov_iter iter;
+ ssize_t ret;
+ loff_t off;
+ int i;
+
+ for (i = 0; i < ow->npages; i++) {
+ set_page_writeback(ow->pages[i]);
+ ow->bv[i].bv_page = ow->pages[i];
+ /* uh except the last one maybe... */
+ if (i == ow->npages - 1 && ow->len % PAGE_SIZE)
+ ow->bv[i].bv_len = ow->len % PAGE_SIZE;
+ else
+ ow->bv[i].bv_len = PAGE_SIZE;
+ ow->bv[i].bv_offset = 0;
+ }
+ iov_iter_bvec(&iter, ITER_BVEC | WRITE, ow->bv, ow->npages, ow->len);
+
+ off = ow->off;
+ wr.uid = ow->uid;
+ wr.gid = ow->gid;
+ ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
+ 0, &wr);
+ if (ret < 0) {
+ for (i = 0; i < ow->npages; i++) {
+ SetPageError(ow->pages[i]);
+ mapping_set_error(ow->pages[i]->mapping, ret);
+ end_page_writeback(ow->pages[i]);
+ unlock_page(ow->pages[i]);
+ }
+ } else {
+ for (i = 0; i < ow->npages; i++) {
+ if (PagePrivate(ow->pages[i])) {
+ wrp = (struct orangefs_write_request *)
+ page_private(ow->pages[i]);
+ ClearPagePrivate(ow->pages[i]);
+ wr_release(wrp);
+ }
+ end_page_writeback(ow->pages[i]);
+ unlock_page(ow->pages[i]);
+ }
+ }
+ return ret;
+}
+
+static int orangefs_writepages_callback(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct orangefs_writepages *ow = data;
+ struct orangefs_write_request *wr;
+ int ret;
+
+ if (!PagePrivate(page)) {
+ unlock_page(page);
+ /* It's not private so there's nothing to write, right? */
+ printk("writepages_callback not private!\n");
+ BUG();
+ return 0;
+ }
+ wr = (struct orangefs_write_request *)page_private(page);
+
+ if (wr->len != PAGE_SIZE) {
+ ret = orangefs_writepage_locked(page, wbc);
+ mapping_set_error(page->mapping, ret);
+ unlock_page(page);
+ } else {
+ ret = -1;
+ if (ow->npages == 0) {
+ ow->off = wr->pos;
+ ow->len = wr->len;
+ ow->uid = wr->uid;
+ ow->gid = wr->gid;
+ ow->pages[ow->npages++] = page;
+ ret = 0;
+ goto done;
+ }
+ if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
+ orangefs_writepages_work(ow, wbc);
+ memset(ow, 0, sizeof *ow);
+ ret = -1;
+ goto done;
+ }
+ if (ow->off + ow->len == wr->pos) {
+ ow->len += wr->len;
+ ow->pages[ow->npages++] = page;
+ ret = 0;
+ goto done;
+ }
+done:
+ if (ret == -1) {
+ ret = orangefs_writepage_locked(page, wbc);
+ mapping_set_error(page->mapping, ret);
+ unlock_page(page);
+ } else {
+ if (ow->npages == ORANGEFS_WRITEPAGES_COUNT) {
+ orangefs_writepages_work(ow, wbc);
+ memset(ow, 0, sizeof *ow);
+ }
+ }
+ }
+ return ret;
+}
+
+static int orangefs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct orangefs_writepages *ow;
+ struct blk_plug plug;
+ int ret;
+ ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
+ if (!ow)
+ return -ENOMEM;
+ mutex_lock(&ORANGEFS_SB(mapping->host->i_sb)->writepages_mutex);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
+ if (ow->npages)
+ ret = orangefs_writepages_work(ow, wbc);
+ blk_finish_plug(&plug);
+ mutex_unlock(&ORANGEFS_SB(mapping->host->i_sb)->writepages_mutex);
+ kfree(ow);
+ return ret;
+}
+
static int orangefs_write_begin(struct file *file,
struct address_space *mapping, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -348,6 +487,7 @@ static int orangefs_launder_page(struct page *page)
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readpage = orangefs_readpage,
+ .writepages = orangefs_writepages,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 256851bab7a5..9e23f97fb5cc 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -220,6 +220,7 @@ struct orangefs_sb_info_s {
int mount_pending;
int no_list;
struct list_head list;
+ struct mutex writepages_mutex;
};

struct orangefs_stats {
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 83abe5ec2d11..204e1ac7f228 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -467,6 +467,7 @@ static int orangefs_fill_sb(struct super_block *sb,

sb->s_export_op = &orangefs_export_ops;
sb->s_root = root_dentry;
+ mutex_init(&ORANGEFS_SB(sb)->writepages_mutex);
return 0;
}

--
2.19.0


2018-10-07 23:29:41

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 13/19] orangefs: skip inode writeout if nothing to write

Would happen if an inode is dirty but whatever happened is not something
that can be written out to OrangeFS.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/orangefs-utils.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 902ebd1599e1..de63bb710e38 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -435,6 +435,11 @@ int orangefs_inode_setattr(struct inode *inode)
copy_attributes_from_inode(inode,
&new_op->upcall.req.setattr.attributes);
orangefs_inode->attr_valid = 0;
+ if (!new_op->upcall.req.setattr.attributes.mask) {
+ spin_unlock(&inode->i_lock);
+ op_release(new_op);
+ return 0;
+ }
spin_unlock(&inode->i_lock);

ret = service_operation(new_op, __func__,
--
2.19.0


2018-10-07 23:29:44

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 15/19] orangefs: avoid fsync service operation on flush

Without this, an fsync call is sent to the server even if no data
changed. This resulted in a rather severe (50%) performance regression
under certain metadata-heavy workloads.

In the past, everything was direct IO. Nothing happend on a close call.
An explicit fsync call would send an fsync request to the server which
in turn fsynced the underlying file.

Now there are cached writes. Then fsync began writing out dirty pages
in addition to making an fsync request to the server, and close began
calling fsync.

With this commit, close only writes out dirty pages, and does not make
the fsync request.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 5eda483263ae..d5ecfea3288a 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -596,7 +596,24 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)

int orangefs_flush(struct file *file, fl_owner_t id)
{
- return vfs_fsync(file, 0);
+ /*
+ * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
+ * service_operation in orangefs_fsync.
+ *
+ * Do not send fsync to OrangeFS server on a close. Do send fsync
+ * on an explicit fsync call. This duplicates historical OrangeFS
+ * behavior.
+ */
+ struct inode *inode = file->f_mapping->host;
+
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ }
+
+ return filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
}

/** ORANGEFS implementation of VFS file operations */
--
2.19.0


2018-10-07 23:29:47

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 12/19] orangefs: implement writepage

Now orangefs_inode_getattr fills from cache if an inode has dirty pages.

also if attr_valid and dirty pages and !flags, we spin on inode writeback
before returning if pages still dirty after: should it be other way

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 77 +++++++-----------------------------
fs/orangefs/inode.c | 59 ++++++++++++++++++++++++---
fs/orangefs/orangefs-utils.c | 12 +++++-
3 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 4bc06c310e02..ba580a5c6fd2 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
*
* See COPYING in top-level directory.
*/
@@ -348,65 +349,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
return generic_file_read_iter(iocb, iter);
}

-static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- loff_t pos;
- ssize_t rc;
-
- BUG_ON(iocb->private);
-
- truncate_inode_pages(file->f_mapping, 0);
-
- gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
-
- inode_lock(file->f_mapping->host);
-
- /* Make sure generic_write_checks sees an up to date inode size. */
- if (file->f_flags & O_APPEND) {
- rc = orangefs_inode_getattr(file->f_mapping->host,
- ORANGEFS_GETATTR_SIZE);
- if (rc == -ESTALE)
- rc = -EIO;
- if (rc) {
- gossip_err("%s: orangefs_inode_getattr failed, "
- "rc:%zd:.\n", __func__, rc);
- goto out;
- }
- }
-
- rc = generic_write_checks(iocb, iter);
-
- if (rc <= 0) {
- gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
- __func__, rc);
- goto out;
- }
-
- /*
- * if we are appending, generic_write_checks would have updated
- * pos to the end of the file, so we will wait till now to set
- * pos...
- */
- pos = iocb->ki_pos;
-
- rc = do_readv_writev(ORANGEFS_IO_WRITE,
- file,
- &pos,
- iter);
- if (rc < 0) {
- gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
- __func__, rc);
- goto out;
- }
-
- iocb->ki_pos = pos;
orangefs_stats.writes++;
-
-out:
-
- inode_unlock(file->f_mapping->host);
- return rc;
+ return generic_file_write_iter(iocb, iter);
}

/*
@@ -501,9 +448,6 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
(char *)file->f_path.dentry->d_name.name :
(char *)"Unknown"));

- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
- return -EINVAL;
-
/* set the sequential readahead hint */
vma->vm_flags |= VM_SEQ_READ;
vma->vm_flags &= ~VM_RAND_READ;
@@ -543,8 +487,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
gossip_debug(GOSSIP_INODE_DEBUG,
"flush_racache finished\n");
}
- truncate_inode_pages(file_inode(file)->i_mapping,
- 0);
}
return 0;
}
@@ -562,6 +504,11 @@ static int orangefs_fsync(struct file *file,
ORANGEFS_I(file_inode(file));
struct orangefs_kernel_op_s *new_op = NULL;

+ ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
+ start, end);
+ if (ret)
+ return ret;
+
new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
if (!new_op)
return -ENOMEM;
@@ -643,6 +590,11 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
return rc;
}

+int orangefs_flush(struct file *file, fl_owner_t id)
+{
+ return vfs_fsync(file, 0);
+}
+
/** ORANGEFS implementation of VFS file operations */
const struct file_operations orangefs_file_operations = {
.llseek = orangefs_file_llseek,
@@ -652,6 +604,7 @@ const struct file_operations orangefs_file_operations = {
.unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
.open = generic_file_open,
+ .flush = orangefs_flush,
.release = orangefs_file_release,
.fsync = orangefs_fsync,
};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 826054b979cc..bd2ce18453f2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -15,6 +15,44 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

+static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct iov_iter iter;
+ struct bio_vec bv;
+ size_t len, wlen;
+ ssize_t ret;
+ loff_t off;
+
+ set_page_writeback(page);
+
+ off = page_offset(page);
+ len = i_size_read(inode);
+ if (off + PAGE_SIZE > len)
+ wlen = len - off;
+ else
+ wlen = PAGE_SIZE;
+
+ bv.bv_page = page;
+ bv.bv_len = wlen;
+ bv.bv_offset = off % PAGE_SIZE;
+ if (wlen == 0)
+ dump_stack();
+ iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, wlen);
+
+ ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
+ len);
+ if (ret < 0) {
+ SetPageError(page);
+ mapping_set_error(page->mapping, ret);
+ } else {
+ ret = 0;
+ }
+ end_page_writeback(page);
+ unlock_page(page);
+ return ret;
+}
+
static int orangefs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
@@ -48,6 +86,15 @@ static int orangefs_readpage(struct file *file, struct page *page)
return ret;
}

+int orangefs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
+{
+ int r;
+ r = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
+ mark_inode_dirty_sync(file_inode(file));
+ return r;
+}
+
static void orangefs_invalidatepage(struct page *page,
unsigned int offset,
unsigned int length)
@@ -77,17 +124,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
loff_t pos = *(&iocb->ki_pos);
- /*
- * This cannot happen until write_iter becomes
- * generic_file_write_iter.
- */
- BUG_ON(iov_iter_rw(iter) != READ);
- return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
+ return do_readv_writev(iov_iter_rw(iter) == WRITE ?
+ ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
}

/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
+ .writepage = orangefs_writepage,
.readpage = orangefs_readpage,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .write_begin = simple_write_begin,
+ .write_end = orangefs_write_end,
.invalidatepage = orangefs_invalidatepage,
.releasepage = orangefs_releasepage,
.direct_IO = orangefs_direct_IO,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 9221c4a3398e..902ebd1599e1 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -247,12 +247,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid) {
+ orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
goto again;
}
+ if (inode->i_state & I_DIRTY_PAGES) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
spin_unlock(&inode->i_lock);
return 0;
}
@@ -281,12 +285,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid) {
+ orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
goto again2;
}
+ if (inode->i_state & I_DIRTY_PAGES) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n",
__func__);
ret = 0;
--
2.19.0


2018-10-07 23:29:59

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 11/19] orangefs: migrate to generic_file_read_iter

Remove orangefs_inode_read. It was used by readpage. Calling
wait_for_direct_io directly serves the purpose just as well. There is
now no check of the bufmap size in the readpage path. There are already
other places the bufmap size is assumed to be greater than PAGE_SIZE.

Important to call truncate_inode_pages now in the write path so a
subsequent read sees the new data.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 68 ++++-------------------------------
fs/orangefs/inode.c | 63 ++++++++++++--------------------
fs/orangefs/orangefs-kernel.h | 13 ++++---
3 files changed, 38 insertions(+), 106 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index aec17635a50f..4bc06c310e02 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -44,7 +44,7 @@ static int flush_racache(struct inode *inode)
/*
* Post and wait for the I/O upcall to finish
*/
-static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
+ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
loff_t *offset, struct iov_iter *iter,
size_t total_size, loff_t readahead_size)
{
@@ -240,7 +240,7 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
* augmented/extended metadata attached to the file.
* Note: File extended attributes override any mount options.
*/
-static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
+ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
loff_t *offset, struct iov_iter *iter)
{
struct inode *inode = file->f_mapping->host;
@@ -341,67 +341,11 @@ static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
return ret;
}

-/*
- * Read data from a specified offset in a file (referenced by inode).
- * Data may be placed either in a user or kernel buffer.
- */
-ssize_t orangefs_inode_read(struct inode *inode,
- struct iov_iter *iter,
- loff_t *offset,
- loff_t readahead_size)
+static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *iter)
{
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
- size_t count = iov_iter_count(iter);
- size_t bufmap_size;
- ssize_t ret = -EINVAL;
-
orangefs_stats.reads++;
-
- bufmap_size = orangefs_bufmap_size_query();
- if (count > bufmap_size) {
- gossip_debug(GOSSIP_FILE_DEBUG,
- "%s: count is too large (%zd/%zd)!\n",
- __func__, count, bufmap_size);
- return -EINVAL;
- }
-
- gossip_debug(GOSSIP_FILE_DEBUG,
- "%s(%pU) %zd@%llu\n",
- __func__,
- &orangefs_inode->refn.khandle,
- count,
- llu(*offset));
-
- ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
- count, readahead_size);
- if (ret > 0)
- *offset += ret;
-
- gossip_debug(GOSSIP_FILE_DEBUG,
- "%s(%pU): Value(%zd) returned.\n",
- __func__,
- &orangefs_inode->refn.khandle,
- ret);
-
- return ret;
-}
-
-static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t rc = 0;
-
- BUG_ON(iocb->private);
-
- gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
-
- orangefs_stats.reads++;
-
- rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
- iocb->ki_pos = pos;
-
- return rc;
+ return generic_file_read_iter(iocb, iter);
}

static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -412,6 +356,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite

BUG_ON(iocb->private);

+ truncate_inode_pages(file->f_mapping, 0);
+
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");

inode_lock(file->f_mapping->host);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 1ef000b69e06..826054b979cc 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -17,37 +17,25 @@

static int orangefs_readpage(struct file *file, struct page *page)
{
- int ret;
- int max_block;
- ssize_t bytes_read = 0;
struct inode *inode = page->mapping->host;
- const __u32 blocksize = PAGE_SIZE;
- const __u32 blockbits = PAGE_SHIFT;
- struct iov_iter to;
- struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
-
- iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
-
- gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_readpage called with page %p\n",
- page);
-
- max_block = ((inode->i_size / blocksize) + 1);
-
- if (page->index < max_block) {
- loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
-
- bytes_read = orangefs_inode_read(inode,
- &to,
- &blockptr_offset,
- inode->i_size);
- }
+ struct iov_iter iter;
+ struct bio_vec bv;
+ ssize_t ret;
+ loff_t off;
+
+ off = page_offset(page);
+ bv.bv_page = page;
+ bv.bv_len = PAGE_SIZE;
+ bv.bv_offset = 0;
+ iov_iter_bvec(&iter, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+
+ ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
+ PAGE_SIZE, inode->i_size);
/* this will only zero remaining unread portions of the page data */
- iov_iter_zero(~0U, &to);
+ iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
flush_dcache_page(page);
- if (bytes_read < 0) {
- ret = bytes_read;
+ if (ret < 0) {
SetPageError(page);
} else {
SetPageUptodate(page);
@@ -84,22 +72,17 @@ static int orangefs_releasepage(struct page *page, gfp_t foo)
return 0;
}

-/*
- * Having a direct_IO entry point in the address_space_operations
- * struct causes the kernel to allows us to use O_DIRECT on
- * open. Nothing will ever call this thing, but in the future we
- * will need to be able to use O_DIRECT on open in order to support
- * AIO. Modeled after NFS, they do this too.
- */
-
static ssize_t orangefs_direct_IO(struct kiocb *iocb,
struct iov_iter *iter)
{
- gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_direct_IO: %pD\n",
- iocb->ki_filp);
-
- return -EINVAL;
+ struct file *file = iocb->ki_filp;
+ loff_t pos = *(&iocb->ki_pos);
+ /*
+ * This cannot happen until write_iter becomes
+ * generic_file_write_iter.
+ */
+ BUG_ON(iov_iter_rw(iter) != READ);
+ return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
}

/** ORANGEFS2 implementation of address space operations */
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index cdb08e51a4b1..e128500e33b4 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -368,11 +368,6 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
struct inode *orangefs_iget(struct super_block *sb,
struct orangefs_object_kref *ref);

-ssize_t orangefs_inode_read(struct inode *inode,
- struct iov_iter *iter,
- loff_t *offset,
- loff_t readahead_size);
-
/*
* defined in devorangefs-req.c
*/
@@ -383,6 +378,14 @@ void orangefs_dev_cleanup(void);
int is_daemon_in_service(void);
bool __is_daemon_in_service(void);

+/*
+ * defined in file.c
+ */
+ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
+ struct iov_iter *, size_t, loff_t);
+ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
+ struct iov_iter *);
+
/*
* defined in orangefs-utils.c
*/
--
2.19.0


2018-10-07 23:30:05

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 09/19] orangefs: remove orangefs_readpages

It's a copy of the loop which would run in read_pages from
mm/readahead.c.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 39 +--------------------------------------
1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index cf0811ef0e93..1ef000b69e06 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -15,7 +15,7 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

-static int read_one_page(struct page *page)
+static int orangefs_readpage(struct file *file, struct page *page)
{
int ret;
int max_block;
@@ -60,42 +60,6 @@ static int read_one_page(struct page *page)
return ret;
}

-static int orangefs_readpage(struct file *file, struct page *page)
-{
- return read_one_page(page);
-}
-
-static int orangefs_readpages(struct file *file,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned nr_pages)
-{
- int page_idx;
- int ret;
-
- gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
-
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page;
-
- page = list_entry(pages->prev, struct page, lru);
- list_del(&page->lru);
- if (!add_to_page_cache(page,
- mapping,
- page->index,
- readahead_gfp_mask(mapping))) {
- ret = read_one_page(page);
- gossip_debug(GOSSIP_INODE_DEBUG,
- "failure adding page to cache, read_one_page returned: %d\n",
- ret);
- } else {
- put_page(page);
- }
- }
- BUG_ON(!list_empty(pages));
- return 0;
-}
-
static void orangefs_invalidatepage(struct page *page,
unsigned int offset,
unsigned int length)
@@ -141,7 +105,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
.readpage = orangefs_readpage,
- .readpages = orangefs_readpages,
.invalidatepage = orangefs_invalidatepage,
.releasepage = orangefs_releasepage,
.direct_IO = orangefs_direct_IO,
--
2.19.0


2018-10-07 23:30:12

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 08/19] orangefs: reorganize setattr functions to track attribute changes

OrangeFS accepts a mask indicating which attributes were changed. The
kernel must not set any bits except those that were actually changed.
The kernel must set the uid/gid of the request to the actual uid/gid
responsible for the change.

Code path for notify_change initiated setattrs is

orangefs_setattr(dentry, iattr)
-> __orangefs_setattr(inode, iattr)

In kernel changes are initiated by calling __orangefs_setattr.

Code path for writeback is

orangefs_write_inode
-> orangefs_inode_setattr

attr_valid and attr_uid and attr_gid change together under i_lock.
I_DIRTY changes separately.

__orangefs_setattr
lock
if needs to be cleaned first, unlock and retry
set attr_valid
copy data in
unlock
mark_inode_dirty

orangefs_inode_setattr
lock
copy attributes out
unlock
clear getattr_time
# __writeback_single_inode clears dirty

orangefs_inode_getattr
# possible to get here with attr_valid set and not dirty
lock
if getattr_time ok or attr_valid set, unlock and return
unlock
do server operation
# another thread may getattr or setattr, so check for that
lock
if getattr_time ok or attr_valid, unlock and return
else, copy in
update getattr_time
unlock

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/acl.c | 4 +-
fs/orangefs/inode.c | 76 ++++++++++++++++++-----
fs/orangefs/namei.c | 36 +++++------
fs/orangefs/orangefs-kernel.h | 8 ++-
fs/orangefs/orangefs-utils.c | 114 +++++++++++++---------------------
fs/orangefs/super.c | 11 +---
6 files changed, 130 insertions(+), 119 deletions(-)

diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 10587413b20e..62e6ab78a915 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -142,7 +142,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
rc = __orangefs_set_acl(inode, acl, type);
} else {
iattr.ia_valid = ATTR_MODE;
- rc = orangefs_inode_setattr(inode, &iattr);
+ rc = __orangefs_setattr(inode, &iattr);
}

return rc;
@@ -181,7 +181,7 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir)
inode->i_mode = mode;
iattr.ia_mode = mode;
iattr.ia_valid |= ATTR_MODE;
- orangefs_inode_setattr(inode, &iattr);
+ __orangefs_setattr(inode, &iattr);
}

return error;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index b16b11294573..cf0811ef0e93 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
*
* See COPYING in top-level directory.
*/
@@ -202,22 +203,31 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
return ret;
}

-/*
- * Change attributes of an object referenced by dentry.
- */
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
int ret;

- gossip_debug(GOSSIP_INODE_DEBUG,
- "%s: called on %pd\n",
- __func__,
- dentry);
-
- ret = setattr_prepare(dentry, iattr);
- if (ret)
- goto out;
+ if (iattr->ia_valid & ATTR_MODE) {
+ if (iattr->ia_mode & (S_ISVTX)) {
+ if (is_root_handle(inode)) {
+ /*
+ * allow sticky bit to be set on root (since
+ * it shows up that way by default anyhow),
+ * but don't show it to the server
+ */
+ iattr->ia_mode -= S_ISVTX;
+ } else {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+ return -EINVAL;
+ }
+ }
+ if (iattr->ia_mode & (S_ISUID)) {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+ return -EINVAL;
+ }
+ }

if (iattr->ia_valid & ATTR_SIZE) {
ret = orangefs_setattr_size(inode, iattr);
@@ -225,7 +235,24 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
goto out;
}

+again:
+ spin_lock(&inode->i_lock);
+ if (ORANGEFS_I(inode)->attr_valid) {
+ if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) &&
+ gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) {
+ ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
+ } else {
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ goto again;
+ }
+ } else {
+ ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
+ ORANGEFS_I(inode)->attr_uid = current_fsuid();
+ ORANGEFS_I(inode)->attr_gid = current_fsgid();
+ }
setattr_copy(inode, iattr);
+ spin_unlock(&inode->i_lock);
mark_inode_dirty(inode);

if (iattr->ia_valid & ATTR_MODE)
@@ -234,7 +261,25 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)

ret = 0;
out:
- gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
+ return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ */
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret;
+ gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
+ dentry);
+ ret = setattr_prepare(dentry, iattr);
+ if (ret)
+ goto out;
+ ret = __orangefs_setattr(d_inode(dentry), iattr);
+ sync_inode_metadata(d_inode(dentry), 1);
+out:
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
+ ret);
return ret;
}

@@ -303,7 +348,7 @@ int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags
iattr.ia_valid |= ATTR_CTIME;
if (flags & S_MTIME)
iattr.ia_valid |= ATTR_MTIME;
- return orangefs_inode_setattr(inode, &iattr);
+ return __orangefs_setattr(inode, &iattr);
}

/* ORANGEFS2 implementation of VFS inode operations for files */
@@ -363,6 +408,7 @@ static int orangefs_set_inode(struct inode *inode, void *data)
struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
ORANGEFS_I(inode)->refn.khandle = ref->khandle;
+ ORANGEFS_I(inode)->attr_valid = 0;
hash_init(ORANGEFS_I(inode)->xattr_cache);
return 0;
}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 7b82fc09291c..16e7f01e4c22 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -83,11 +83,10 @@ static int orangefs_create(struct inode *dir,
__func__,
dentry);

- dir->i_mtime = dir->i_ctime = current_time(dir);
memset(&iattr, 0, sizeof iattr);
- iattr.ia_valid |= ATTR_MTIME;
- orangefs_inode_setattr(dir, &iattr);
- mark_inode_dirty_sync(dir);
+ iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+ __orangefs_setattr(dir, &iattr);
ret = 0;
out:
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -208,11 +207,11 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
if (!ret) {
drop_nlink(inode);

- dir->i_mtime = dir->i_ctime = current_time(dir);
memset(&iattr, 0, sizeof iattr);
- iattr.ia_valid |= ATTR_MTIME;
- orangefs_inode_setattr(dir, &iattr);
- mark_inode_dirty_sync(dir);
+ iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+ __orangefs_setattr(dir, &iattr);
+ ret = 0;
}
return ret;
}
@@ -296,11 +295,10 @@ static int orangefs_symlink(struct inode *dir,
get_khandle_from_ino(inode),
dentry);

- dir->i_mtime = dir->i_ctime = current_time(dir);
memset(&iattr, 0, sizeof iattr);
- iattr.ia_valid |= ATTR_MTIME;
- orangefs_inode_setattr(dir, &iattr);
- mark_inode_dirty_sync(dir);
+ iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+ __orangefs_setattr(dir, &iattr);
ret = 0;
out:
return ret;
@@ -367,11 +365,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
* NOTE: we have no good way to keep nlink consistent for directories
* across clients; keep constant at 1.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
memset(&iattr, 0, sizeof iattr);
- iattr.ia_valid |= ATTR_MTIME;
- orangefs_inode_setattr(dir, &iattr);
- mark_inode_dirty_sync(dir);
+ iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ iattr.ia_mtime = iattr.ia_ctime = current_time(dir);
+ __orangefs_setattr(dir, &iattr);
out:
return ret;
}
@@ -393,11 +390,10 @@ static int orangefs_rename(struct inode *old_dir,
"orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
old_dentry, new_dentry, d_count(new_dentry));

- new_dir->i_mtime = new_dir->i_ctime = current_time(new_dir);
memset(&iattr, 0, sizeof iattr);
- iattr.ia_valid |= ATTR_MTIME;
- orangefs_inode_setattr(new_dir, &iattr);
- mark_inode_dirty_sync(new_dir);
+ iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ iattr.ia_mtime = iattr.ia_ctime = current_time(new_dir);
+ __orangefs_setattr(new_dir, &iattr);

new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
if (!new_op)
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 6b21ec59738c..9fe60d086e2d 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -192,6 +192,9 @@ struct orangefs_inode_s {
sector_t last_failed_block_index_read;

unsigned long getattr_time;
+ int attr_valid;
+ kuid_t attr_uid;
+ kgid_t attr_gid;

DECLARE_HASHTABLE(xattr_cache, 4);
};
@@ -344,7 +347,8 @@ struct inode *orangefs_new_inode(struct super_block *sb,
dev_t dev,
struct orangefs_object_kref *ref);

-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
+int __orangefs_setattr(struct inode *, struct iattr *);
+int orangefs_setattr(struct dentry *, struct iattr *);

int orangefs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
@@ -402,7 +406,7 @@ int orangefs_inode_getattr(struct inode *, int);

int orangefs_inode_check_changed(struct inode *inode);

-int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
+int orangefs_inode_setattr(struct inode *inode);

bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);

diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index d44cbe96719a..a4fac527f85d 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -136,51 +136,37 @@ static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs)
* NOTE: in kernel land, we never use the sys_attr->link_target for
* anything, so don't bother copying it into the sys_attr object here.
*/
-static inline int copy_attributes_from_inode(struct inode *inode,
- struct ORANGEFS_sys_attr_s *attrs,
- struct iattr *iattr)
+static inline void copy_attributes_from_inode(struct inode *inode,
+ struct ORANGEFS_sys_attr_s *attrs)
{
- umode_t tmp_mode;
-
- if (!iattr || !inode || !attrs) {
- gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
- "in copy_attributes_from_inode!\n",
- iattr,
- inode,
- attrs);
- return -EINVAL;
- }
- /*
- * We need to be careful to only copy the attributes out of the
- * iattr object that we know are valid.
- */
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
attrs->mask = 0;
- if (iattr->ia_valid & ATTR_UID) {
- attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid);
+ if (orangefs_inode->attr_valid & ATTR_UID) {
+ attrs->owner = from_kuid(&init_user_ns, inode->i_uid);
attrs->mask |= ORANGEFS_ATTR_SYS_UID;
gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
}
- if (iattr->ia_valid & ATTR_GID) {
- attrs->group = from_kgid(&init_user_ns, iattr->ia_gid);
+ if (orangefs_inode->attr_valid & ATTR_GID) {
+ attrs->group = from_kgid(&init_user_ns, inode->i_gid);
attrs->mask |= ORANGEFS_ATTR_SYS_GID;
gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
}

- if (iattr->ia_valid & ATTR_ATIME) {
+ if (orangefs_inode->attr_valid & ATTR_ATIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
- if (iattr->ia_valid & ATTR_ATIME_SET) {
- attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
+ if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
+ attrs->atime = (time64_t)inode->i_atime.tv_sec;
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
}
}
- if (iattr->ia_valid & ATTR_MTIME) {
+ if (orangefs_inode->attr_valid & ATTR_MTIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
- if (iattr->ia_valid & ATTR_MTIME_SET) {
- attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
+ if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
+ attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
}
}
- if (iattr->ia_valid & ATTR_CTIME)
+ if (orangefs_inode->attr_valid & ATTR_CTIME)
attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;

/*
@@ -189,36 +175,10 @@ static inline int copy_attributes_from_inode(struct inode *inode,
* worry about ATTR_SIZE
*/

- if (iattr->ia_valid & ATTR_MODE) {
- tmp_mode = iattr->ia_mode;
- if (tmp_mode & (S_ISVTX)) {
- if (is_root_handle(inode)) {
- /*
- * allow sticky bit to be set on root (since
- * it shows up that way by default anyhow),
- * but don't show it to the server
- */
- tmp_mode -= S_ISVTX;
- } else {
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "%s: setting sticky bit not supported.\n",
- __func__);
- return -EINVAL;
- }
- }
-
- if (tmp_mode & (S_ISUID)) {
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "%s: setting setuid bit not supported.\n",
- __func__);
- return -EINVAL;
- }
-
- attrs->perms = ORANGEFS_util_translate_mode(tmp_mode);
+ if (orangefs_inode->attr_valid & ATTR_MODE) {
+ attrs->perms = ORANGEFS_util_translate_mode(inode->i_mode);
attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
}
-
- return 0;
}

static int orangefs_inode_type(enum orangefs_ds_type objtype)
@@ -283,10 +243,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n",
__func__, get_khandle_from_ino(inode), flags);

+again:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- inode->i_state & I_DIRTY) {
+ orangefs_inode->attr_valid) {
+ if (orangefs_inode->attr_valid) {
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ goto again;
+ }
spin_unlock(&inode->i_lock);
return 0;
}
@@ -311,10 +277,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
if (ret != 0)
goto out;

+again2:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- inode->i_state & I_DIRTY) {
+ orangefs_inode->attr_valid) {
+ if (orangefs_inode->attr_valid) {
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ goto again2;
+ }
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n",
__func__);
ret = 0;
@@ -438,7 +410,7 @@ int orangefs_inode_check_changed(struct inode *inode)
* issues a orangefs setattr request to make sure the new attribute values
* take effect if successful. returns 0 on success; -errno otherwise
*/
-int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
+int orangefs_inode_setattr(struct inode *inode)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -448,24 +420,26 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
if (!new_op)
return -ENOMEM;

+ spin_lock(&inode->i_lock);
+ new_op->upcall.uid = from_kuid(&init_user_ns, orangefs_inode->attr_uid);
+ new_op->upcall.gid = from_kgid(&init_user_ns, orangefs_inode->attr_gid);
new_op->upcall.req.setattr.refn = orangefs_inode->refn;
- ret = copy_attributes_from_inode(inode,
- &new_op->upcall.req.setattr.attributes,
- iattr);
- if (ret >= 0) {
- ret = service_operation(new_op, __func__,
- get_interruptible_flag(inode));
+ copy_attributes_from_inode(inode,
+ &new_op->upcall.req.setattr.attributes);
+ orangefs_inode->attr_valid = 0;
+ spin_unlock(&inode->i_lock);

- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_inode_setattr: returning %d\n",
- ret);
- }
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_inode_setattr: returning %d\n", ret);
+ if (ret)
+ orangefs_make_bad_inode(inode);

op_release(new_op);

if (ret == 0)
orangefs_inode->getattr_time = jiffies - 1;
-
return ret;
}

diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 788869c8233b..83abe5ec2d11 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -153,17 +153,8 @@ static void orangefs_destroy_inode(struct inode *inode)

int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct iattr iattr;
gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n");
- iattr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME |
- ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME;
- iattr.ia_mode = inode->i_mode;
- iattr.ia_uid = inode->i_uid;
- iattr.ia_gid = inode->i_gid;
- iattr.ia_atime = inode->i_atime;
- iattr.ia_mtime = inode->i_mtime;
- iattr.ia_ctime = inode->i_ctime;
- return orangefs_inode_setattr(inode, &iattr);
+ return orangefs_inode_setattr(inode);
}

/*
--
2.19.0


2018-10-07 23:30:49

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 16/19] orangefs: use kmem_cache for orangefs_write_request

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 10 +++++-----
fs/orangefs/orangefs-cache.c | 24 ++++++++++++++++++++++--
fs/orangefs/orangefs-kernel.h | 6 ++++--
fs/orangefs/orangefs-mod.c | 10 +++++-----
4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5c155b259b13..f53d768acdd9 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -66,7 +66,7 @@ static int orangefs_writepage_locked(struct page *page,
ret = 0;
if (wr) {
ClearPagePrivate(page);
- kfree(wr);
+ wr_release(wr);
}
}
end_page_writeback(page);
@@ -126,7 +126,7 @@ static int update_wr(struct page *page, loff_t pos, unsigned len, int mwrite)
else
wr->len = wr->pos + wr->len - wr->pos;
} else {
- wr = kmalloc(sizeof *wr, GFP_KERNEL);
+ wr = wr_alloc();
if (wr) {
wr->pos = pos;
wr->len = len;
@@ -272,7 +272,7 @@ static void orangefs_invalidatepage(struct page *page,
/* XXX prove */
if (offset == 0 && length == PAGE_SIZE) {
ClearPagePrivate(page);
- kfree(wr);
+ wr_release(wr);
} else if (wr->pos - page_offset(page) < offset &&
wr->pos - page_offset(page) + wr->len > offset + length) {
wbc.range_start = page_file_offset(page);
@@ -284,7 +284,7 @@ static void orangefs_invalidatepage(struct page *page,
return;
} else {
ClearPagePrivate(page);
- kfree(wr);
+ wr_release(wr);
}
} else if (wr->pos - page_offset(page) < offset &&
wr->pos - page_offset(page) + wr->len <= offset + length) {
@@ -299,7 +299,7 @@ static void orangefs_invalidatepage(struct page *page,
* entire write range is to be invalidated.
*/
ClearPagePrivate(page);
- kfree(wr);
+ wr_release(wr);
}
}
return;
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 3b6982bf6bcf..cfb405ca8aa5 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -16,8 +16,9 @@ static DEFINE_SPINLOCK(next_tag_value_lock);

/* a cache for orangefs upcall/downcall operations */
static struct kmem_cache *op_cache;
+static struct kmem_cache *wr_cache;

-int op_cache_initialize(void)
+int orangefs_caches_initialize(void)
{
op_cache = kmem_cache_create("orangefs_op_cache",
sizeof(struct orangefs_kernel_op_s),
@@ -34,12 +35,21 @@ int op_cache_initialize(void)
spin_lock(&next_tag_value_lock);
next_tag_value = 100;
spin_unlock(&next_tag_value_lock);
+
+ wr_cache = kmem_cache_create("orangefs_wr_cache",
+ sizeof(struct orangefs_write_request), 0, ORANGEFS_CACHE_CREATE_FLAGS, NULL);
+ if (!wr_cache) {
+ gossip_err("Cannot create orangefs_wr_cache\n");
+ return -ENOMEM;
+ }
+
return 0;
}

-int op_cache_finalize(void)
+int orangefs_caches_finalize(void)
{
kmem_cache_destroy(op_cache);
+ kmem_cache_destroy(wr_cache);
return 0;
}

@@ -162,3 +172,13 @@ void op_release(struct orangefs_kernel_op_s *orangefs_op)
gossip_err("NULL pointer in op_release\n");
}
}
+
+struct orangefs_write_request *wr_alloc(void)
+{
+ return kmem_cache_zalloc(wr_cache, GFP_KERNEL);
+}
+
+void wr_release(struct orangefs_write_request *wr)
+{
+ kmem_cache_free(wr_cache, wr);
+}
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 2e9726d1de7d..256851bab7a5 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -311,11 +311,13 @@ static inline int match_handle(struct orangefs_khandle resp_handle,
/*
* defined in orangefs-cache.c
*/
-int op_cache_initialize(void);
-int op_cache_finalize(void);
+int orangefs_caches_initialize(void);
+int orangefs_caches_finalize(void);
struct orangefs_kernel_op_s *op_alloc(__s32 type);
void orangefs_new_tag(struct orangefs_kernel_op_s *op);
char *get_opname_string(struct orangefs_kernel_op_s *new_op);
+struct orangefs_write_request *wr_alloc(void);
+void wr_release(struct orangefs_write_request *);

int orangefs_inode_cache_initialize(void);
int orangefs_inode_cache_finalize(void);
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 85ef87245a87..c7373e682653 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -87,13 +87,13 @@ static int __init orangefs_init(void)
slot_timeout_secs = 0;

/* initialize global book keeping data structures */
- ret = op_cache_initialize();
+ ret = orangefs_caches_initialize();
if (ret < 0)
goto out;

ret = orangefs_inode_cache_initialize();
if (ret < 0)
- goto cleanup_op;
+ goto cleanup_caches;

orangefs_htable_ops_in_progress =
kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
@@ -172,8 +172,8 @@ static int __init orangefs_init(void)
cleanup_inode:
orangefs_inode_cache_finalize();

-cleanup_op:
- op_cache_finalize();
+cleanup_caches:
+ orangefs_caches_finalize();

out:
return ret;
@@ -194,7 +194,7 @@ static void __exit orangefs_exit(void)
BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i]));

orangefs_inode_cache_finalize();
- op_cache_finalize();
+ orangefs_caches_finalize();

kfree(orangefs_htable_ops_in_progress);

--
2.19.0


2018-10-07 23:30:49

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 14/19] orangefs: write range tracking

This is necessary to ensure the uid/gid responsible for the write is
communicated with the server. Only one uid/gid may have outstanding
changes at a time. If another uid/gid writes while there are
outstanding changes, the changes must be written out before the new
data is put into the page.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 12 +-
fs/orangefs/inode.c | 267 ++++++++++++++++++++++++++++++----
fs/orangefs/orangefs-kernel.h | 12 +-
3 files changed, 261 insertions(+), 30 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index ba580a5c6fd2..5eda483263ae 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode)
* Post and wait for the I/O upcall to finish
*/
ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
- loff_t *offset, struct iov_iter *iter,
- size_t total_size, loff_t readahead_size)
+ loff_t *offset, struct iov_iter *iter, size_t total_size,
+ loff_t readahead_size, struct orangefs_write_request *wr)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
@@ -103,6 +103,10 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
__func__, (long)ret);
goto out;
}
+ if (wr) {
+ new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
+ new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
+ }
}

gossip_debug(GOSSIP_FILE_DEBUG,
@@ -292,7 +296,7 @@ ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
(int)*offset);

ret = wait_for_direct_io(type, inode, offset, iter,
- each_count, 0);
+ each_count, 0, NULL);
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): return from wait_for_io:%d\n",
__func__,
@@ -434,7 +438,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf)
static const struct vm_operations_struct orangefs_file_vm_ops = {
.fault = orangefs_fault,
.map_pages = filemap_map_pages,
- .page_mkwrite = filemap_page_mkwrite,
+ .page_mkwrite = orangefs_page_mkwrite,
};

/*
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index bd2ce18453f2..5c155b259b13 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -15,9 +15,11 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

-static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
+static int orangefs_writepage_locked(struct page *page,
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
+ struct orangefs_write_request *wr;
struct iov_iter iter;
struct bio_vec bv;
size_t len, wlen;
@@ -26,33 +28,175 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)

set_page_writeback(page);

- off = page_offset(page);
- len = i_size_read(inode);
- if (off + PAGE_SIZE > len)
- wlen = len - off;
- else
- wlen = PAGE_SIZE;
+ if (PagePrivate(page)) {
+ wr = (struct orangefs_write_request *)page_private(page);
+ BUG_ON(!wr);
+ if (wr->mwrite) {
+ off = page_offset(page);
+ len = i_size_read(inode);
+ if (off + PAGE_SIZE > len)
+ wlen = len - off;
+ else
+ wlen = PAGE_SIZE;
+ } else {
+ off = wr->pos;
+ wlen = wr->len;
+ len = i_size_read(inode);
+ }
+ } else {
+/* BUG();*/
+ /* It's not private so there's nothing to write, right? */
+ printk("writepage not private!\n");
+ end_page_writeback(page);
+ return 0;
+
+ }

bv.bv_page = page;
bv.bv_len = wlen;
bv.bv_offset = off % PAGE_SIZE;
- if (wlen == 0)
- dump_stack();
iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, wlen);

ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
- len);
+ len, wr);
if (ret < 0) {
SetPageError(page);
mapping_set_error(page->mapping, ret);
} else {
ret = 0;
+ if (wr) {
+ ClearPagePrivate(page);
+ kfree(wr);
+ }
}
end_page_writeback(page);
- unlock_page(page);
return ret;
}

+static int do_writepage_if_necessary(struct page *page, loff_t pos,
+ unsigned len)
+{
+ struct orangefs_write_request *wr;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ };
+ int r;
+ if (PagePrivate(page)) {
+ wr = (struct orangefs_write_request *)page_private(page);
+ BUG_ON(!wr);
+ /*
+ * If the new request is not contiguous with the last one or if
+ * the uid or gid is different, the page must be written out
+ * before continuing.
+ */
+ if (pos + len < wr->pos || wr->pos + wr->len < pos ||
+ !uid_eq(current_fsuid(), wr->uid) ||
+ !gid_eq(current_fsgid(), wr->gid)) {
+ wbc.range_start = page_file_offset(page);
+ wbc.range_end = wbc.range_start + PAGE_SIZE - 1;
+ wait_on_page_writeback(page);
+ if (clear_page_dirty_for_io(page)) {
+ r = orangefs_writepage_locked(page, &wbc);
+ if (r)
+ return r;
+ }
+ BUG_ON(PagePrivate(page));
+ }
+ }
+ return 0;
+}
+
+static int update_wr(struct page *page, loff_t pos, unsigned len, int mwrite)
+{
+ struct orangefs_write_request *wr;
+ if (PagePrivate(page)) {
+ wr = (struct orangefs_write_request *)page_private(page);
+ BUG_ON(!wr);
+ if (mwrite) {
+ wr->mwrite = 1;
+ return 0;
+ }
+ if (pos < wr->pos) {
+ wr->len += wr->pos - pos;
+ wr->pos = pos;
+ }
+ if (pos + len > wr->pos + wr->len)
+ wr->len = pos + len - wr->pos;
+ else
+ wr->len = wr->pos + wr->len - wr->pos;
+ } else {
+ wr = kmalloc(sizeof *wr, GFP_KERNEL);
+ if (wr) {
+ wr->pos = pos;
+ wr->len = len;
+ wr->uid = current_fsuid();
+ wr->gid = current_fsgid();
+ wr->mwrite = mwrite;
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long)wr);
+ } else {
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+int orangefs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ unsigned len;
+ int r;
+
+ /* Do not write past the file size. */
+ len = i_size_read(inode) - page_file_offset(page);
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+
+ lock_page(page);
+ r = do_writepage_if_necessary(page, page_file_offset(page),
+ len);
+ if (r) {
+ r = VM_FAULT_RETRY;
+ unlock_page(vmf->page);
+ return r;
+ }
+ r = update_wr(page, page_file_offset(page), len, 1);
+ if (r) {
+ r = VM_FAULT_RETRY;
+ unlock_page(vmf->page);
+ return r;
+ }
+
+ r = VM_FAULT_LOCKED;
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ r = VM_FAULT_NOPAGE;
+ goto out;
+ }
+ /*
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
+ */
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return r;
+}
+
+static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int r;
+ r = orangefs_writepage_locked(page, wbc);
+ unlock_page(page);
+ return r;
+}
+
static int orangefs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
@@ -68,7 +212,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
iov_iter_bvec(&iter, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);

ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
- PAGE_SIZE, inode->i_size);
+ PAGE_SIZE, inode->i_size, NULL);
/* this will only zero remaining unread portions of the page data */
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
@@ -86,10 +230,26 @@ static int orangefs_readpage(struct file *file, struct page *page)
return ret;
}

+static int orangefs_write_begin(struct file *file,
+ struct address_space *mapping, loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int r;
+ r = simple_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+ if (r)
+ return r;
+ r = do_writepage_if_necessary(*pagep, pos, len);
+ if (r)
+ unlock_page(*pagep);
+ return r;
+}
+
int orangefs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
{
int r;
+ if (update_wr(page, pos, len, 0))
+ return -ENOMEM;
r = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
mark_inode_dirty_sync(file_inode(file));
return r;
@@ -99,24 +259,68 @@ static void orangefs_invalidatepage(struct page *page,
unsigned int offset,
unsigned int length)
{
- gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_invalidatepage called on page %p "
- "(offset is %u)\n",
- page,
- offset);
-
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
+ struct orangefs_write_request *wr;
+ /* XXX move to releasepage and call + rebase */
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ };
+ int r;
+ if (PagePrivate(page)) {
+ wr = (struct orangefs_write_request *)page_private(page);
+ BUG_ON(!wr);
+/* XXX prove */
+ if (offset == 0 && length == PAGE_SIZE) {
+ ClearPagePrivate(page);
+ kfree(wr);
+ } else if (wr->pos - page_offset(page) < offset &&
+ wr->pos - page_offset(page) + wr->len > offset + length) {
+ wbc.range_start = page_file_offset(page);
+ wbc.range_end = wbc.range_start + PAGE_SIZE - 1;
+ wait_on_page_writeback(page);
+ if (clear_page_dirty_for_io(page)) {
+ r = orangefs_writepage_locked(page, &wbc);
+ if (r)
+ return;
+ } else {
+ ClearPagePrivate(page);
+ kfree(wr);
+ }
+ } else if (wr->pos - page_offset(page) < offset &&
+ wr->pos - page_offset(page) + wr->len <= offset + length) {
+ wr->len = offset;
+ } else if (wr->pos - page_offset(page) >= offset &&
+ wr->pos - page_offset(page) + wr->len > offset + length) {
+ wr->pos += length - wr->pos + page_offset(page);
+ wr->len -= length - wr->pos + page_offset(page);
+ } else {
+ /*
+ * Invalidate range is bigger than write range but
+ * entire write range is to be invalidated.
+ */
+ ClearPagePrivate(page);
+ kfree(wr);
+ }
+ }
return;

}

static int orangefs_releasepage(struct page *page, gfp_t foo)
{
- gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_releasepage called on page %p\n",
- page);
- return 0;
+ /*
+ * Two cases are mentioned in vfs.txt. Only one is relevant
+ * "VM finds a clean page with no active users and wants to make it a
+ * free page" However this page will not be private.
+ * "request has been made to invalidate some or all pages in an
+ * address_space" So we call orangefs_invalidatepage.
+ */
+ if (PagePrivate(page)) {
+ orangefs_invalidatepage(page, 0, PAGE_SIZE);
+ return !PagePrivate(page);
+ } else {
+ return 1;
+ }
}

static ssize_t orangefs_direct_IO(struct kiocb *iocb,
@@ -128,16 +332,29 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
}

+static int orangefs_launder_page(struct page *page)
+{
+ int r = 0;
+ if (PagePrivate(page)) {
+ if (clear_page_dirty_for_io(page))
+ r = orangefs_writepage_locked(page, NULL);
+ return r;
+ } else {
+ return 0;
+ }
+}
+
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readpage = orangefs_readpage,
.set_page_dirty = __set_page_dirty_nobuffers,
- .write_begin = simple_write_begin,
+ .write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
.invalidatepage = orangefs_invalidatepage,
.releasepage = orangefs_releasepage,
.direct_IO = orangefs_direct_IO,
+ .launder_page = orangefs_launder_page,
};

static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index e128500e33b4..2e9726d1de7d 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -178,6 +178,14 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
}
}

+struct orangefs_write_request {
+ loff_t pos;
+ unsigned len;
+ kuid_t uid;
+ kgid_t gid;
+ int mwrite;
+};
+
/* per inode private orangefs info */
struct orangefs_inode_s {
struct orangefs_object_kref refn;
@@ -341,6 +349,8 @@ void fsid_key_table_finalize(void);
/*
* defined in inode.c
*/
+int orangefs_page_mkwrite(struct vm_fault *);
+
struct inode *orangefs_new_inode(struct super_block *sb,
struct inode *dir,
int mode,
@@ -382,7 +392,7 @@ bool __is_daemon_in_service(void);
* defined in file.c
*/
ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
- struct iov_iter *, size_t, loff_t);
+ struct iov_iter *, size_t, loff_t, struct orangefs_write_request *);
ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
struct iov_iter *);

--
2.19.0


2018-10-07 23:31:02

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 05/19] orangefs: hold i_lock during inode_getattr

This should be a no-op now. When inode writeback works, this will
prevent a getattr from overwriting inode data while an inode is
transitioning to dirty.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 4 ++--
fs/orangefs/orangefs-utils.c | 33 +++++++++++++++++++++++----------
2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 067fd7111103..ec3996a61f92 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -253,8 +253,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = path->dentry->d_inode;

gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_getattr: called on %pd\n",
- path->dentry);
+ "orangefs_getattr: called on %pd mask %u\n",
+ path->dentry, request_mask);

ret = orangefs_inode_getattr(inode,
request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 76f18a3494c7..d44cbe96719a 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -280,12 +280,17 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
loff_t inode_size;
int ret, type;

- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
- get_khandle_from_ino(inode));
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n",
+ __func__, get_khandle_from_ino(inode), flags);

+ spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
- if (!flags && time_before(jiffies, orangefs_inode->getattr_time))
+ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
+ inode->i_state & I_DIRTY) {
+ spin_unlock(&inode->i_lock);
return 0;
+ }
+ spin_unlock(&inode->i_lock);

new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
if (!new_op)
@@ -306,13 +311,23 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
if (ret != 0)
goto out;

+ spin_lock(&inode->i_lock);
+ /* Must have all the attributes in the mask and be within cache time. */
+ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
+ inode->i_state & I_DIRTY) {
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n",
+ __func__);
+ ret = 0;
+ goto out_unlock;
+ }
+
if (!(flags & ORANGEFS_GETATTR_NEW)) {
ret = orangefs_inode_is_stale(inode,
&new_op->downcall.resp.getattr.attributes,
new_op->downcall.resp.getattr.link_target);
if (ret) {
ret = -ESTALE;
- goto out;
+ goto out_unlock;
}
}

@@ -328,19 +343,15 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
inode->i_size = inode_size;
inode->i_blkbits = ffs(new_op->downcall.resp.getattr.
attributes.blksize);
- spin_lock(&inode->i_lock);
inode->i_bytes = inode_size;
inode->i_blocks =
(inode_size + 512 - inode_size % 512)/512;
- spin_unlock(&inode->i_lock);
}
break;
case S_IFDIR:
if (flags) {
inode->i_size = PAGE_SIZE;
- spin_lock(&inode->i_lock);
inode_set_bytes(inode, inode->i_size);
- spin_unlock(&inode->i_lock);
}
set_nlink(inode, 1);
break;
@@ -353,7 +364,7 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
ORANGEFS_NAME_MAX);
if (ret == -E2BIG) {
ret = -EIO;
- goto out;
+ goto out_unlock;
}
inode->i_link = orangefs_inode->link_target;
}
@@ -363,7 +374,7 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
/* XXX: ESTALE? This is what is done if it is not new. */
orangefs_make_bad_inode(inode);
ret = -ESTALE;
- goto out;
+ goto out_unlock;
}

inode->i_uid = make_kuid(&init_user_ns, new_op->
@@ -387,6 +398,8 @@ int orangefs_inode_getattr(struct inode *inode, int flags)
orangefs_inode->getattr_time = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
ret = 0;
+out_unlock:
+ spin_unlock(&inode->i_lock);
out:
op_release(new_op);
return ret;
--
2.19.0


2018-10-07 23:31:06

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 04/19] orangefs: update attributes rather than relying on server

This should be a no-op now, but once inode writeback works, it'll be
necessary to have the correct attribute in the dirty inode.

Previously the attribute fetch timeout was marked invalid and the server
provided the updated attribute. When the inode is dirty, the server
cannot be consulted since it does not yet know the pending setattr.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 10 ++--------
fs/orangefs/namei.c | 7 ++++++-
2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 3ab6e5126899..aec17635a50f 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -327,14 +327,8 @@ static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
file_accessed(file);
} else {
file_update_time(file);
- /*
- * Must invalidate to ensure write loop doesn't
- * prevent kernel from reading updated
- * attribute. Size probably changed because of
- * the write, and other clients could update
- * any other attribute.
- */
- orangefs_inode->getattr_time = jiffies - 1;
+ if (*offset > i_size_read(inode))
+ i_size_write(inode, *offset);
}
}

diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 46b5f06b7e4c..7b82fc09291c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -383,6 +383,7 @@ static int orangefs_rename(struct inode *old_dir,
unsigned int flags)
{
struct orangefs_kernel_op_s *new_op;
+ struct iattr iattr;
int ret;

if (flags)
@@ -392,7 +393,11 @@ static int orangefs_rename(struct inode *old_dir,
"orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
old_dentry, new_dentry, d_count(new_dentry));

- ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
+ new_dir->i_mtime = new_dir->i_ctime = current_time(new_dir);
+ memset(&iattr, 0, sizeof iattr);
+ iattr.ia_valid |= ATTR_MTIME;
+ orangefs_inode_setattr(new_dir, &iattr);
+ mark_inode_dirty_sync(new_dir);

new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
if (!new_op)
--
2.19.0


2018-10-07 23:31:23

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 03/19] orangefs: simplify orangefs_inode_getattr interface

No need to store the received mask. It is either STATX_BASIC_STATS or
STATX_BASIC_STATS & ~STATX_SIZE. If STATX_SIZE is requested, the cache
is bypassed anyway, so the cached mask is unnecessary to decide whether
to do a real getattr.

This is a change. Previously a getattr would want size and use the
cached size. All of the in-kernel callers that wanted size did not want
a cached size. Now a getattr cannot use the cached size if it wants
size at all.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/file.c | 17 ++++++++---------
fs/orangefs/inode.c | 11 ++++++-----
fs/orangefs/orangefs-kernel.h | 7 ++++---
fs/orangefs/orangefs-utils.c | 31 ++++++++++---------------------
4 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe76568f..3ab6e5126899 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -424,8 +424,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite

/* Make sure generic_write_checks sees an up to date inode size. */
if (file->f_flags & O_APPEND) {
- rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
- STATX_SIZE);
+ rc = orangefs_inode_getattr(file->f_mapping->host,
+ ORANGEFS_GETATTR_SIZE);
if (rc == -ESTALE)
rc = -EIO;
if (rc) {
@@ -532,14 +532,13 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
int ret;
-
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
- STATX_SIZE);
+ ret = orangefs_inode_getattr(file->f_mapping->host,
+ ORANGEFS_GETATTR_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
- gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
- __func__, ret);
+ gossip_err("%s: orangefs_inode_getattr failed, "
+ "ret:%d:.\n", __func__, ret);
return VM_FAULT_SIGBUS;
}
return filemap_fault(vmf);
@@ -660,8 +659,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
* NOTE: We are only interested in file size here,
* so we set mask accordingly.
*/
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
- STATX_SIZE);
+ ret = orangefs_inode_getattr(file->f_mapping->host,
+ ORANGEFS_GETATTR_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 2f1a5f36a103..067fd7111103 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -162,7 +162,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
iattr->ia_size);

/* Ensure that we have a up to date size, so we know if it changed. */
- ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
+ ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
@@ -256,7 +256,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
"orangefs_getattr: called on %pd\n",
path->dentry);

- ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
+ ret = orangefs_inode_getattr(inode,
+ request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
if (ret == 0) {
generic_fillattr(inode, stat);

@@ -287,7 +288,7 @@ int orangefs_permission(struct inode *inode, int mask)
gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);

/* Make sure the permission (and other common attrs) are up to date. */
- ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
+ ret = orangefs_inode_getattr(inode, 0);
if (ret < 0)
return ret;

@@ -409,7 +410,7 @@ struct inode *orangefs_iget(struct super_block *sb,
if (!inode || !(inode->i_state & I_NEW))
return inode;

- error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
+ error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -454,7 +455,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */

- error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
+ error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
if (error)
goto out_iput;

diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 0c76b8899fd1..6b21ec59738c 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -192,7 +192,6 @@ struct orangefs_inode_s {
sector_t last_failed_block_index_read;

unsigned long getattr_time;
- u32 getattr_mask;

DECLARE_HASHTABLE(xattr_cache, 4);
};
@@ -396,8 +395,10 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);

-int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
- u32 request_mask);
+#define ORANGEFS_GETATTR_NEW 1
+#define ORANGEFS_GETATTR_SIZE 2
+
+int orangefs_inode_getattr(struct inode *, int);

int orangefs_inode_check_changed(struct inode *inode);

diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 804c8a261e4b..76f18a3494c7 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
*
* See COPYING in top-level directory.
*/
@@ -272,8 +273,7 @@ static int orangefs_inode_is_stale(struct inode *inode,
return 0;
}

-int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
- u32 request_mask)
+int orangefs_inode_getattr(struct inode *inode, int flags)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -283,16 +283,9 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
get_khandle_from_ino(inode));

- if (!new && !bypass) {
- /*
- * Must have all the attributes in the mask and be within cache
- * time.
- */
- if ((request_mask & orangefs_inode->getattr_mask) ==
- request_mask &&
- time_before(jiffies, orangefs_inode->getattr_time))
- return 0;
- }
+ /* Must have all the attributes in the mask and be within cache time. */
+ if (!flags && time_before(jiffies, orangefs_inode->getattr_time))
+ return 0;

new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
if (!new_op)
@@ -302,7 +295,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
* Size is the hardest attribute to get. The incremental cost of any
* other attribute is essentially zero.
*/
- if (request_mask & STATX_SIZE || new)
+ if (flags)
new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
else
new_op->upcall.req.getattr.mask =
@@ -313,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
if (ret != 0)
goto out;

- if (!new) {
+ if (!(flags & ORANGEFS_GETATTR_NEW)) {
ret = orangefs_inode_is_stale(inode,
&new_op->downcall.resp.getattr.attributes,
new_op->downcall.resp.getattr.link_target);
@@ -329,7 +322,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- if (request_mask & STATX_SIZE || new) {
+ if (flags) {
inode_size = (loff_t)new_op->
downcall.resp.getattr.attributes.size;
inode->i_size = inode_size;
@@ -343,7 +336,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
}
break;
case S_IFDIR:
- if (request_mask & STATX_SIZE || new) {
+ if (flags) {
inode->i_size = PAGE_SIZE;
spin_lock(&inode->i_lock);
inode_set_bytes(inode, inode->i_size);
@@ -352,7 +345,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
set_nlink(inode, 1);
break;
case S_IFLNK:
- if (new) {
+ if (flags & ORANGEFS_GETATTR_NEW) {
inode->i_size = (loff_t)strlen(new_op->
downcall.resp.getattr.link_target);
ret = strscpy(orangefs_inode->link_target,
@@ -393,10 +386,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,

orangefs_inode->getattr_time = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
- if (request_mask & STATX_SIZE || new)
- orangefs_inode->getattr_mask = STATX_BASIC_STATS;
- else
- orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
ret = 0;
out:
op_release(new_op);
--
2.19.0


2018-10-07 23:31:24

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 01/19] orangefs: implement xattr cache

This uses the same timeout as the getattr cache. This substantially
increases performance when writing files with smaller buffer sizes.

When writing, the size is (often) changed, which causes a call to
notify_change which calls security_inode_need_killpriv which needs a
getxattr. Caching it reduces traffic to the server.

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/inode.c | 1 +
fs/orangefs/orangefs-kernel.h | 10 ++++
fs/orangefs/super.c | 9 +++
fs/orangefs/xattr.c | 104 ++++++++++++++++++++++++++++++++++
4 files changed, 124 insertions(+)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 31932879b716..a7a8d3647ffe 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -367,6 +367,7 @@ static int orangefs_set_inode(struct inode *inode, void *data)
struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
ORANGEFS_I(inode)->refn.khandle = ref->khandle;
+ hash_init(ORANGEFS_I(inode)->xattr_cache);
return 0;
}

diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 17b24ad6b264..0c76b8899fd1 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -193,6 +193,8 @@ struct orangefs_inode_s {

unsigned long getattr_time;
u32 getattr_mask;
+
+ DECLARE_HASHTABLE(xattr_cache, 4);
};

/* per superblock private orangefs info */
@@ -217,6 +219,14 @@ struct orangefs_stats {
unsigned long writes;
};

+struct orangefs_cached_xattr {
+ struct hlist_node node;
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+ char val[ORANGEFS_MAX_XATTR_VALUELEN];
+ ssize_t length;
+ unsigned long timeout;
+};
+
extern struct orangefs_stats orangefs_stats;

/*
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index dfaee90d30bd..4c36481208f5 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -128,6 +128,15 @@ static void orangefs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_cached_xattr *cx;
+ struct hlist_node *tmp;
+ int i;
+
+ hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) {
+ hlist_del(&cx->node);
+ kfree(cx);
+ }
+
kmem_cache_free(orangefs_inode_cache, orangefs_inode);
}

diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 03bcb871544d..998c3563bcdd 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2001 Clemson University and The University of Chicago
+ * Copyright 2018 Omnibond Systems, L.L.C.
*
* See COPYING in top-level directory.
*/
@@ -50,6 +51,35 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags)
return internal_flag;
}

+static unsigned int xattr_key(const char *key)
+{
+ unsigned int i = 0;
+ while (key)
+ i += *key++;
+ return i % 16;
+}
+
+static struct orangefs_cached_xattr *find_cached_xattr(struct inode *inode,
+ const char *key)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_cached_xattr *cx;
+ struct hlist_head *h;
+ struct hlist_node *tmp;
+ h = &orangefs_inode->xattr_cache[xattr_key(key)];
+ if (hlist_empty(h))
+ return NULL;
+ hlist_for_each_entry_safe(cx, tmp, h, node) {
+/* if (!time_before(jiffies, cx->timeout)) {
+ hlist_del(&cx->node);
+ kfree(cx);
+ continue;
+ }*/
+ if (!strcmp(cx->key, key))
+ return cx;
+ }
+ return NULL;
+}

/*
* Tries to get a specified key's attributes of a given
@@ -65,6 +95,7 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op = NULL;
+ struct orangefs_cached_xattr *cx;
ssize_t ret = -ENOMEM;
ssize_t length = 0;
int fsuid;
@@ -93,6 +124,27 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,

down_read(&orangefs_inode->xattr_sem);

+ cx = find_cached_xattr(inode, name);
+ if (cx && time_before(jiffies, cx->timeout)) {
+ if (cx->length == -1) {
+ ret = -ENODATA;
+ goto out_unlock;
+ } else {
+ if (size == 0) {
+ ret = cx->length;
+ goto out_unlock;
+ }
+ if (cx->length > size) {
+ ret = -ERANGE;
+ goto out_unlock;
+ }
+ memcpy(buffer, cx->val, cx->length);
+ memset(buffer + cx->length, 0, size - cx->length);
+ ret = cx->length;
+ goto out_unlock;
+ }
+ }
+
new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
if (!new_op)
goto out_unlock;
@@ -117,6 +169,15 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
" does not exist!\n",
get_khandle_from_ino(inode),
(char *)new_op->upcall.req.getxattr.key);
+ cx = kmalloc(sizeof *cx, GFP_KERNEL);
+ if (cx) {
+ strcpy(cx->key, name);
+ cx->length = -1;
+ cx->timeout = jiffies +
+ orangefs_getattr_timeout_msecs*HZ/1000;
+ hash_add(orangefs_inode->xattr_cache, &cx->node,
+ xattr_key(cx->key));
+ }
}
goto out_release_op;
}
@@ -156,6 +217,23 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,

ret = length;

+ if (cx) {
+ strcpy(cx->key, name);
+ memcpy(cx->val, buffer, length);
+ cx->length = length;
+ cx->timeout = jiffies + HZ;
+ } else {
+ cx = kmalloc(sizeof *cx, GFP_KERNEL);
+ if (cx) {
+ strcpy(cx->key, name);
+ memcpy(cx->val, buffer, length);
+ cx->length = length;
+ cx->timeout = jiffies + HZ;
+ hash_add(orangefs_inode->xattr_cache, &cx->node,
+ xattr_key(cx->key));
+ }
+ }
+
out_release_op:
op_release(new_op);
out_unlock:
@@ -168,6 +246,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op = NULL;
+ struct orangefs_cached_xattr *cx;
+ struct hlist_head *h;
+ struct hlist_node *tmp;
int ret = -ENOMEM;

if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN)
@@ -209,6 +290,16 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
"orangefs_inode_removexattr: returning %d\n", ret);

op_release(new_op);
+
+ h = &orangefs_inode->xattr_cache[xattr_key(name)];
+ hlist_for_each_entry_safe(cx, tmp, h, node) {
+ if (!strcmp(cx->key, name)) {
+ hlist_del(&cx->node);
+ kfree(cx);
+ break;
+ }
+ }
+
out_unlock:
up_write(&orangefs_inode->xattr_sem);
return ret;
@@ -226,6 +317,9 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
int internal_flag = 0;
+ struct orangefs_cached_xattr *cx;
+ struct hlist_head *h;
+ struct hlist_node *tmp;
int ret = -ENOMEM;

gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -287,6 +381,16 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,

/* when request is serviced properly, free req op struct */
op_release(new_op);
+
+ h = &orangefs_inode->xattr_cache[xattr_key(name)];
+ hlist_for_each_entry_safe(cx, tmp, h, node) {
+ if (!strcmp(cx->key, name)) {
+ hlist_del(&cx->node);
+ kfree(cx);
+ break;
+ }
+ }
+
out_unlock:
up_write(&orangefs_inode->xattr_sem);
return ret;
--
2.19.0


2018-10-07 23:31:58

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 06/19] orangefs: set up and use backing_dev_info

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/super.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 4c36481208f5..61bec955b285 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -406,15 +406,11 @@ static int orangefs_fill_sb(struct super_block *sb,
struct orangefs_fs_mount_response *fs_mount,
void *data, int silent)
{
- int ret = -EINVAL;
- struct inode *root = NULL;
- struct dentry *root_dentry = NULL;
+ int ret;
+ struct inode *root;
+ struct dentry *root_dentry;
struct orangefs_object_kref root_object;

- /* alloc and init our private orangefs sb info */
- sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
- if (!ORANGEFS_SB(sb))
- return -ENOMEM;
ORANGEFS_SB(sb)->sb = sb;

ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
@@ -437,6 +433,10 @@ static int orangefs_fill_sb(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;

+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+
root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
gossip_debug(GOSSIP_SUPER_DEBUG,
@@ -515,6 +515,13 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
goto free_op;
}

+ /* alloc and init our private orangefs sb info */
+ sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+ if (!ORANGEFS_SB(sb)) {
+ d = ERR_PTR(-ENOMEM);
+ goto free_op;
+ }
+
ret = orangefs_fill_sb(sb,
&new_op->downcall.resp.fs_mount, data,
flags & SB_SILENT ? 1 : 0);
--
2.19.0


2018-10-07 23:32:23

by Martin Brandenburg

[permalink] [raw]
Subject: [PATCH 10/19] orangefs: service ops done for writeback are not killable

Signed-off-by: Martin Brandenburg <[email protected]>
---
fs/orangefs/orangefs-kernel.h | 1 +
fs/orangefs/orangefs-utils.c | 2 +-
fs/orangefs/waitqueue.c | 18 ++++++++++--------
3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 9fe60d086e2d..cdb08e51a4b1 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -441,6 +441,7 @@ extern const struct dentry_operations orangefs_dentry_operations;
#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */
#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */
#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */
+#define ORANGEFS_OP_WRITEBACK 32

int service_operation(struct orangefs_kernel_op_s *op,
const char *op_name,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index a4fac527f85d..9221c4a3398e 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -430,7 +430,7 @@ int orangefs_inode_setattr(struct inode *inode)
spin_unlock(&inode->i_lock);

ret = service_operation(new_op, __func__,
- get_interruptible_flag(inode));
+ get_interruptible_flag(inode) | ORANGEFS_OP_WRITEBACK);
gossip_debug(GOSSIP_UTILS_DEBUG,
"orangefs_inode_setattr: returning %d\n", ret);
if (ret)
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 0729d2645d6a..beafc33d57be 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -19,7 +19,7 @@

static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
long timeout,
- bool interruptible)
+ int flags)
__acquires(op->lock);
static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
__releases(op->lock);
@@ -143,9 +143,7 @@ int service_operation(struct orangefs_kernel_op_s *op,
if (!(flags & ORANGEFS_OP_NO_MUTEX))
mutex_unlock(&orangefs_request_mutex);

- ret = wait_for_matching_downcall(op, timeout,
- flags & ORANGEFS_OP_INTERRUPTIBLE);
-
+ ret = wait_for_matching_downcall(op, timeout, flags);
gossip_debug(GOSSIP_WAIT_DEBUG,
"%s: wait_for_matching_downcall returned %d for %p\n",
__func__,
@@ -319,10 +317,12 @@ static void
*/
static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
long timeout,
- bool interruptible)
+ int flags)
__acquires(op->lock)
{
long n;
+ int writeback = flags & ORANGEFS_OP_WRITEBACK,
+ interruptible = flags & ORANGEFS_OP_INTERRUPTIBLE;

/*
* There's a "schedule_timeout" inside of these wait
@@ -330,10 +330,12 @@ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
* user process that needs something done and is being
* manipulated by the client-core process.
*/
- if (interruptible)
+ if (writeback)
+ n = wait_for_completion_io_timeout(&op->waitq, timeout);
+ else if (!writeback && interruptible)
n = wait_for_completion_interruptible_timeout(&op->waitq,
- timeout);
- else
+ timeout);
+ else /* !writeback && !interruptible but compiler complains */
n = wait_for_completion_killable_timeout(&op->waitq, timeout);

spin_lock(&op->lock);
--
2.19.0


2018-10-08 20:44:44

by Mike Marshall

[permalink] [raw]
Subject: Re: [PATCH 00/19] [V2] orangefs: page cache

I still hit writepages_callback not private! on xfstests generic/127...

-Mike
On Sun, Oct 7, 2018 at 7:27 PM Martin Brandenburg <[email protected]> wrote:
>
> V2... see https://marc.info/?l=linux-fsdevel&m=153721507330730&w=2
>
> One important change is the following, without which an unaligned write
> may end up written to the beginning of its page. Surprisingly xfstests
> did not catch this. This was caught by an invalidate_inode_pages2 call
> in read_iter (not part of this patch series) which exposed the bug.
>
> diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
> index 34b98d2ed377..cd1263c45bb2 100644
> --- a/fs/orangefs/inode.c
> +++ b/fs/orangefs/inode.c
> @@ -56,7 +54,7 @@ static int orangefs_writepage_locked(struct page *page,
>
> bv.bv_page = page;
> bv.bv_len = wlen;
> - bv.bv_offset = 0;
> + bv.bv_offset = off % PAGE_SIZE;
> iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, wlen);
>
> ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
>
> The other big thing is an improved releasepage implementation and a
> launder_page implementation.
>
> We also size the writepages buffer based on the client core buffer.
> However I'm not happy with the memory allocation in writepages.
>
> Mike, I know you've had some trouble with the original series. I'd like
> to know if this fixes that.
>
> Martin Brandenburg (19):
> orangefs: implement xattr cache
> orangefs: do not invalidate attributes on inode create
> orangefs: simplify orangefs_inode_getattr interface
> orangefs: update attributes rather than relying on server
> orangefs: hold i_lock during inode_getattr
> orangefs: set up and use backing_dev_info
> orangefs: let setattr write to cached inode
> orangefs: reorganize setattr functions to track attribute changes
> orangefs: remove orangefs_readpages
> orangefs: service ops done for writeback are not killable
> orangefs: migrate to generic_file_read_iter
> orangefs: implement writepage
> orangefs: skip inode writeout if nothing to write
> orangefs: write range tracking
> orangefs: avoid fsync service operation on flush
> orangefs: use kmem_cache for orangefs_write_request
> orangefs: implement writepages
> orangefs: use client-core buffer size to determine writepages count
> orangefs: do writepages_work if a single page must be written
>
> fs/orangefs/acl.c | 4 +-
> fs/orangefs/file.c | 194 +++--------
> fs/orangefs/inode.c | 628 ++++++++++++++++++++++++++++------
> fs/orangefs/namei.c | 41 +--
> fs/orangefs/orangefs-cache.c | 24 +-
> fs/orangefs/orangefs-kernel.h | 56 ++-
> fs/orangefs/orangefs-mod.c | 10 +-
> fs/orangefs/orangefs-utils.c | 181 +++++-----
> fs/orangefs/super.c | 38 +-
> fs/orangefs/waitqueue.c | 18 +-
> fs/orangefs/xattr.c | 104 ++++++
> 11 files changed, 893 insertions(+), 405 deletions(-)
>
> --
> 2.19.0
>

2018-10-08 23:17:01

by Martin Brandenburg

[permalink] [raw]
Subject: Re: [PATCH 00/19] [V2] orangefs: page cache

On Mon, Oct 08, 2018 at 04:43:36PM -0400, Mike Marshall wrote:
> I still hit writepages_callback not private! on xfstests generic/127...

There's six fsx runs in there. Can you figure out which one fails?

Martin

2018-10-09 23:44:16

by Martin Brandenburg

[permalink] [raw]
Subject: invalidatepage questions (was Re: [PATCH 14/19] orangefs: write range tracking)

I have some questions about my assumptions writing this code. I
definitely have something wrong.

(Given my comments in invalidatepage, it should be clear I'm not quite
confident that it's doing the right thing.)

I thought writpeage could not be called while PagePrivate is not set.
It is set during a write to reflect the changed data, then cleared in
writepage as the write is done.

It is also cleared in invalidatepage since the write is no longer
necessary. However, I see writepage calls after an invalidatepage
causing a ClearPagePrivate. These naturally hit the BUG().

Running xfstests generic/127 I see a page with a pending write from
2110 to 4096 (end of page) and then an invalidate on that page from 744
to 4096. I ClearPagePrivate since there is no longer any data to write
from that page. But the kernel doens't know that and calls writepage.

Should I ClearPageDirty too?

Should I just ClearPagePrivate and then use the fact that it's not set
as a hint not to do anything in writepage?

Something I'm not thinking of?

Martin

On Sun, Oct 07, 2018 at 11:27:31PM +0000, Martin Brandenburg wrote:
> This is necessary to ensure the uid/gid responsible for the write is
> communicated with the server. Only one uid/gid may have outstanding
> changes at a time. If another uid/gid writes while there are
> outstanding changes, the changes must be written out before the new
> data is put into the page.
>
> Signed-off-by: Martin Brandenburg <[email protected]>
> ---
> fs/orangefs/file.c | 12 +-
> fs/orangefs/inode.c | 267 ++++++++++++++++++++++++++++++----
> fs/orangefs/orangefs-kernel.h | 12 +-
> 3 files changed, 261 insertions(+), 30 deletions(-)
>
> diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
> index ba580a5c6fd2..5eda483263ae 100644
> --- a/fs/orangefs/file.c
> +++ b/fs/orangefs/file.c
> @@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode)
> * Post and wait for the I/O upcall to finish
> */
> ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
> - loff_t *offset, struct iov_iter *iter,
> - size_t total_size, loff_t readahead_size)
> + loff_t *offset, struct iov_iter *iter, size_t total_size,
> + loff_t readahead_size, struct orangefs_write_request *wr)
> {
> struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
> struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
> @@ -103,6 +103,10 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
> __func__, (long)ret);
> goto out;
> }
> + if (wr) {
> + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
> + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
> + }
> }
>
> gossip_debug(GOSSIP_FILE_DEBUG,
> @@ -292,7 +296,7 @@ ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
> (int)*offset);
>
> ret = wait_for_direct_io(type, inode, offset, iter,
> - each_count, 0);
> + each_count, 0, NULL);
> gossip_debug(GOSSIP_FILE_DEBUG,
> "%s(%pU): return from wait_for_io:%d\n",
> __func__,
> @@ -434,7 +438,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf)
> static const struct vm_operations_struct orangefs_file_vm_ops = {
> .fault = orangefs_fault,
> .map_pages = filemap_map_pages,
> - .page_mkwrite = filemap_page_mkwrite,
> + .page_mkwrite = orangefs_page_mkwrite,
> };
>
> /*
> diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
> index bd2ce18453f2..5c155b259b13 100644
> --- a/fs/orangefs/inode.c
> +++ b/fs/orangefs/inode.c
> @@ -15,9 +15,11 @@
> #include "orangefs-kernel.h"
> #include "orangefs-bufmap.h"
>
> -static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
> +static int orangefs_writepage_locked(struct page *page,
> + struct writeback_control *wbc)
> {
> struct inode *inode = page->mapping->host;
> + struct orangefs_write_request *wr;
> struct iov_iter iter;
> struct bio_vec bv;
> size_t len, wlen;
> @@ -26,33 +28,175 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
>
> set_page_writeback(page);
>
> - off = page_offset(page);
> - len = i_size_read(inode);
> - if (off + PAGE_SIZE > len)
> - wlen = len - off;
> - else
> - wlen = PAGE_SIZE;
> + if (PagePrivate(page)) {
> + wr = (struct orangefs_write_request *)page_private(page);
> + BUG_ON(!wr);
> + if (wr->mwrite) {
> + off = page_offset(page);
> + len = i_size_read(inode);
> + if (off + PAGE_SIZE > len)
> + wlen = len - off;
> + else
> + wlen = PAGE_SIZE;
> + } else {
> + off = wr->pos;
> + wlen = wr->len;
> + len = i_size_read(inode);
> + }
> + } else {
> +/* BUG();*/
> + /* It's not private so there's nothing to write, right? */
> + printk("writepage not private!\n");
> + end_page_writeback(page);
> + return 0;
> +
> + }
>
> bv.bv_page = page;
> bv.bv_len = wlen;
> bv.bv_offset = off % PAGE_SIZE;
> - if (wlen == 0)
> - dump_stack();
> iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, wlen);
>
> ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
> - len);
> + len, wr);
> if (ret < 0) {
> SetPageError(page);
> mapping_set_error(page->mapping, ret);
> } else {
> ret = 0;
> + if (wr) {
> + ClearPagePrivate(page);
> + kfree(wr);
> + }
> }
> end_page_writeback(page);
> - unlock_page(page);
> return ret;
> }
>
> +static int do_writepage_if_necessary(struct page *page, loff_t pos,
> + unsigned len)
> +{
> + struct orangefs_write_request *wr;
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + .nr_to_write = 0,
> + };
> + int r;
> + if (PagePrivate(page)) {
> + wr = (struct orangefs_write_request *)page_private(page);
> + BUG_ON(!wr);
> + /*
> + * If the new request is not contiguous with the last one or if
> + * the uid or gid is different, the page must be written out
> + * before continuing.
> + */
> + if (pos + len < wr->pos || wr->pos + wr->len < pos ||
> + !uid_eq(current_fsuid(), wr->uid) ||
> + !gid_eq(current_fsgid(), wr->gid)) {
> + wbc.range_start = page_file_offset(page);
> + wbc.range_end = wbc.range_start + PAGE_SIZE - 1;
> + wait_on_page_writeback(page);
> + if (clear_page_dirty_for_io(page)) {
> + r = orangefs_writepage_locked(page, &wbc);
> + if (r)
> + return r;
> + }
> + BUG_ON(PagePrivate(page));
> + }
> + }
> + return 0;
> +}
> +
> +static int update_wr(struct page *page, loff_t pos, unsigned len, int mwrite)
> +{
> + struct orangefs_write_request *wr;
> + if (PagePrivate(page)) {
> + wr = (struct orangefs_write_request *)page_private(page);
> + BUG_ON(!wr);
> + if (mwrite) {
> + wr->mwrite = 1;
> + return 0;
> + }
> + if (pos < wr->pos) {
> + wr->len += wr->pos - pos;
> + wr->pos = pos;
> + }
> + if (pos + len > wr->pos + wr->len)
> + wr->len = pos + len - wr->pos;
> + else
> + wr->len = wr->pos + wr->len - wr->pos;
> + } else {
> + wr = kmalloc(sizeof *wr, GFP_KERNEL);
> + if (wr) {
> + wr->pos = pos;
> + wr->len = len;
> + wr->uid = current_fsuid();
> + wr->gid = current_fsgid();
> + wr->mwrite = mwrite;
> + SetPagePrivate(page);
> + set_page_private(page, (unsigned long)wr);
> + } else {
> + return -ENOMEM;
> + }
> + }
> + return 0;
> +}
> +
> +int orangefs_page_mkwrite(struct vm_fault *vmf)
> +{
> + struct page *page = vmf->page;
> + struct inode *inode = file_inode(vmf->vma->vm_file);
> + unsigned len;
> + int r;
> +
> + /* Do not write past the file size. */
> + len = i_size_read(inode) - page_file_offset(page);
> + if (len > PAGE_SIZE)
> + len = PAGE_SIZE;
> +
> + lock_page(page);
> + r = do_writepage_if_necessary(page, page_file_offset(page),
> + len);
> + if (r) {
> + r = VM_FAULT_RETRY;
> + unlock_page(vmf->page);
> + return r;
> + }
> + r = update_wr(page, page_file_offset(page), len, 1);
> + if (r) {
> + r = VM_FAULT_RETRY;
> + unlock_page(vmf->page);
> + return r;
> + }
> +
> + r = VM_FAULT_LOCKED;
> + sb_start_pagefault(inode->i_sb);
> + file_update_time(vmf->vma->vm_file);
> + if (page->mapping != inode->i_mapping) {
> + unlock_page(page);
> + r = VM_FAULT_NOPAGE;
> + goto out;
> + }
> + /*
> + * We mark the page dirty already here so that when freeze is in
> + * progress, we are guaranteed that writeback during freezing will
> + * see the dirty page and writeprotect it again.
> + */
> + set_page_dirty(page);
> + wait_for_stable_page(page);
> +out:
> + sb_end_pagefault(inode->i_sb);
> + return r;
> +}
> +
> +static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
> +{
> + int r;
> + r = orangefs_writepage_locked(page, wbc);
> + unlock_page(page);
> + return r;
> +}
> +
> static int orangefs_readpage(struct file *file, struct page *page)
> {
> struct inode *inode = page->mapping->host;
> @@ -68,7 +212,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
> iov_iter_bvec(&iter, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
>
> ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
> - PAGE_SIZE, inode->i_size);
> + PAGE_SIZE, inode->i_size, NULL);
> /* this will only zero remaining unread portions of the page data */
> iov_iter_zero(~0U, &iter);
> /* takes care of potential aliasing */
> @@ -86,10 +230,26 @@ static int orangefs_readpage(struct file *file, struct page *page)
> return ret;
> }
>
> +static int orangefs_write_begin(struct file *file,
> + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags,
> + struct page **pagep, void **fsdata)
> +{
> + int r;
> + r = simple_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
> + if (r)
> + return r;
> + r = do_writepage_if_necessary(*pagep, pos, len);
> + if (r)
> + unlock_page(*pagep);
> + return r;
> +}
> +
> int orangefs_write_end(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
> {
> int r;
> + if (update_wr(page, pos, len, 0))
> + return -ENOMEM;
> r = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
> mark_inode_dirty_sync(file_inode(file));
> return r;
> @@ -99,24 +259,68 @@ static void orangefs_invalidatepage(struct page *page,
> unsigned int offset,
> unsigned int length)
> {
> - gossip_debug(GOSSIP_INODE_DEBUG,
> - "orangefs_invalidatepage called on page %p "
> - "(offset is %u)\n",
> - page,
> - offset);
> -
> - ClearPageUptodate(page);
> - ClearPageMappedToDisk(page);
> + struct orangefs_write_request *wr;
> + /* XXX move to releasepage and call + rebase */
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + .nr_to_write = 0,
> + };
> + int r;
> + if (PagePrivate(page)) {
> + wr = (struct orangefs_write_request *)page_private(page);
> + BUG_ON(!wr);
> +/* XXX prove */
> + if (offset == 0 && length == PAGE_SIZE) {
> + ClearPagePrivate(page);
> + kfree(wr);
> + } else if (wr->pos - page_offset(page) < offset &&
> + wr->pos - page_offset(page) + wr->len > offset + length) {
> + wbc.range_start = page_file_offset(page);
> + wbc.range_end = wbc.range_start + PAGE_SIZE - 1;
> + wait_on_page_writeback(page);
> + if (clear_page_dirty_for_io(page)) {
> + r = orangefs_writepage_locked(page, &wbc);
> + if (r)
> + return;
> + } else {
> + ClearPagePrivate(page);
> + kfree(wr);
> + }
> + } else if (wr->pos - page_offset(page) < offset &&
> + wr->pos - page_offset(page) + wr->len <= offset + length) {
> + wr->len = offset;
> + } else if (wr->pos - page_offset(page) >= offset &&
> + wr->pos - page_offset(page) + wr->len > offset + length) {
> + wr->pos += length - wr->pos + page_offset(page);
> + wr->len -= length - wr->pos + page_offset(page);
> + } else {
> + /*
> + * Invalidate range is bigger than write range but
> + * entire write range is to be invalidated.
> + */
> + ClearPagePrivate(page);
> + kfree(wr);
> + }
> + }
> return;
>
> }
>
> static int orangefs_releasepage(struct page *page, gfp_t foo)
> {
> - gossip_debug(GOSSIP_INODE_DEBUG,
> - "orangefs_releasepage called on page %p\n",
> - page);
> - return 0;
> + /*
> + * Two cases are mentioned in vfs.txt. Only one is relevant
> + * "VM finds a clean page with no active users and wants to make it a
> + * free page" However this page will not be private.
> + * "request has been made to invalidate some or all pages in an
> + * address_space" So we call orangefs_invalidatepage.
> + */
> + if (PagePrivate(page)) {
> + orangefs_invalidatepage(page, 0, PAGE_SIZE);
> + return !PagePrivate(page);
> + } else {
> + return 1;
> + }
> }
>
> static ssize_t orangefs_direct_IO(struct kiocb *iocb,
> @@ -128,16 +332,29 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
> ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
> }
>
> +static int orangefs_launder_page(struct page *page)
> +{
> + int r = 0;
> + if (PagePrivate(page)) {
> + if (clear_page_dirty_for_io(page))
> + r = orangefs_writepage_locked(page, NULL);
> + return r;
> + } else {
> + return 0;
> + }
> +}
> +
> /** ORANGEFS2 implementation of address space operations */
> static const struct address_space_operations orangefs_address_operations = {
> .writepage = orangefs_writepage,
> .readpage = orangefs_readpage,
> .set_page_dirty = __set_page_dirty_nobuffers,
> - .write_begin = simple_write_begin,
> + .write_begin = orangefs_write_begin,
> .write_end = orangefs_write_end,
> .invalidatepage = orangefs_invalidatepage,
> .releasepage = orangefs_releasepage,
> .direct_IO = orangefs_direct_IO,
> + .launder_page = orangefs_launder_page,
> };
>
> static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
> diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
> index e128500e33b4..2e9726d1de7d 100644
> --- a/fs/orangefs/orangefs-kernel.h
> +++ b/fs/orangefs/orangefs-kernel.h
> @@ -178,6 +178,14 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
> }
> }
>
> +struct orangefs_write_request {
> + loff_t pos;
> + unsigned len;
> + kuid_t uid;
> + kgid_t gid;
> + int mwrite;
> +};
> +
> /* per inode private orangefs info */
> struct orangefs_inode_s {
> struct orangefs_object_kref refn;
> @@ -341,6 +349,8 @@ void fsid_key_table_finalize(void);
> /*
> * defined in inode.c
> */
> +int orangefs_page_mkwrite(struct vm_fault *);
> +
> struct inode *orangefs_new_inode(struct super_block *sb,
> struct inode *dir,
> int mode,
> @@ -382,7 +392,7 @@ bool __is_daemon_in_service(void);
> * defined in file.c
> */
> ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
> - struct iov_iter *, size_t, loff_t);
> + struct iov_iter *, size_t, loff_t, struct orangefs_write_request *);
> ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
> struct iov_iter *);
>
> --
> 2.19.0
>