LinuxLists.cc - [RFC Patch] fs: implement per-file drop caches

2012-05-30 14:37:07

Subject: [RFC Patch] fs: implement per-file drop caches

This is a draft patch of implementing per-file drop caches.

It introduces a new fcntl command F_DROP_CACHES to drop
file caches of a specific file. The reason is that currently
we only have a system-wide drop-caches interface, which can
degrade performance across the whole system if we drop all page
caches when we actually only want to drop the caches of one huge file.

Below is a small test case for this patch:

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#define __USE_GNU
#include <fcntl.h>

/*
 * Print a label followed by the "Cache*" lines of /proc/meminfo.
 *
 * stdout is flushed before system() so the label is written before the
 * child's output even when stdout is redirected to a file or pipe.
 */
static void
report_cache(const char *label)
{
	printf("%s:\n", label);
	fflush(stdout);
	if (system("grep ^Cache /proc/meminfo") != 0)
		fprintf(stderr, "warning: could not read /proc/meminfo\n");
}

/*
 * Test driver for the proposed F_DROP_CACHES fcntl command.
 *
 * Usage: prog <file>
 *
 * Opens <file> read-only, asks the kernel to read ahead up to 100 MiB of
 * it, then issues the new fcntl command and prints the system-wide cache
 * figures before and after each step.
 *
 * Returns 0 on success, 1 if the argument is missing or if open(),
 * readahead() or fcntl() fails.
 */
int
main(int argc, char *argv[])
{
	enum {
		/* F_LINUX_SPECIFIC_BASE + 9 in the patch; base taken as 1024. */
		DROP_CACHES_CMD = 1024 + 9,
		/* Bit 0: page cache, bit 1: dentry/inode caches. */
		DROP_EVERYTHING = 3,
		READAHEAD_BYTES = 1024 * 1024 * 100
	};
	int fd;
	int status = 0;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n",
			argc > 0 ? argv[0] : "drop-cache-test");
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		return 1;
	}

	report_cache("Before readahead");
	if (readahead(fd, 0, READAHEAD_BYTES) != 0) {
		/* The original reported this failure as "open". */
		perror("readahead");
		status = 1;
		goto out_close;
	}

	report_cache("Before drop cache");
	if (fcntl(fd, DROP_CACHES_CMD, DROP_EVERYTHING) == -1) {
		/* A kernel without the patch rejects the unknown command. */
		perror("fcntl");
		status = 1;
		goto out_close;
	}
	report_cache("After drop cache");

out_close:
	close(fd);
	return status;
}

I used a file of 100M size for testing, and I can see
the cache size of the whole system drops 70000K after
dropping the caches of this big file.

Any comments?

Signed-off-by: Cong Wang <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]

---
fs/dcache.c | 56 ++++++++++++++++++++++++++++++------------------
fs/drop_caches.c | 30 ++++++++++++++++++++++++++
fs/fcntl.c | 4 +++
fs/inode.c | 37 ++++++++++++++++++++++++++++++++
include/linux/fcntl.h | 1 +
include/linux/fs.h | 2 +
include/linux/mm.h | 1 +
7 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4435d8b..5262851 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -585,28 +585,14 @@ kill_it:
}
EXPORT_SYMBOL(dput);

-/**
- * d_invalidate - invalidate a dentry
- * @dentry: dentry to invalidate
- *
- * Try to invalidate the dentry if it turns out to be
- * possible. If there are other dentries that can be
- * reached through this one we can't delete it and we
- * return -EBUSY. On success we return 0.
- *
- * no dcache lock.
- */
-
-int d_invalidate(struct dentry * dentry)
+int __d_invalidate(struct dentry * dentry)
{
/*
* If it's already been dropped, return OK.
*/
- spin_lock(&dentry->d_lock);
- if (d_unhashed(dentry)) {
- spin_unlock(&dentry->d_lock);
+ if (d_unhashed(dentry))
return 0;
- }
+
/*
* Check whether to do a partial shrink_dcache
* to get rid of unused child entries.
@@ -630,16 +616,33 @@ int d_invalidate(struct dentry * dentry)
* directory or not.
*/
if (dentry->d_count > 1 && dentry->d_inode) {
- if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
- spin_unlock(&dentry->d_lock);
+ if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry))
return -EBUSY;
- }
}

__d_drop(dentry);
- spin_unlock(&dentry->d_lock);
return 0;
}
+
+/**
+ * d_invalidate - invalidate a dentry
+ * @dentry: dentry to invalidate
+ *
+ * Try to invalidate the dentry if it turns out to be
+ * possible. If there are other dentries that can be
+ * reached through this one we can't delete it and we
+ * return -EBUSY. On success we return 0.
+ *
+ * no dcache lock.
+ */
+int d_invalidate(struct dentry * dentry)
+{
+ int ret;
+ spin_lock(&dentry->d_lock);
+ ret = __d_invalidate(dentry);
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
EXPORT_SYMBOL(d_invalidate);

/* This must be called with d_lock held */
@@ -898,6 +901,17 @@ relock:
shrink_dentry_list(&tmp);
}

+void prune_dcache_one(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_REFERENCED)
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ dentry_lru_del(dentry);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ __d_invalidate(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055..805f150 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -65,3 +65,33 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
}
return 0;
}
+
+static void drop_pagecache_file(struct file *filp)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ (inode->i_mapping->nrpages == 0)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ invalidate_mapping_pages(inode->i_mapping, 0, -1);
+ iput(inode);
+}
+
+
+void file_drop_caches(struct file *filp, unsigned long which)
+{
+ if (which & 1)
+ drop_pagecache_file(filp);
+
+ if (which & 2) {
+ struct dentry *dentry = filp->f_path.dentry;
+
+ prune_dcache_one(dentry);
+ prune_icache_one(dentry->d_inode);
+ }
+}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d078b75..a97f10a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -420,6 +420,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
+ case F_DROP_CACHES:
+ err = 0;
+ file_drop_caches(filp, arg);
+ break;
default:
break;
}
diff --git a/fs/inode.c b/fs/inode.c
index 6bc8761..a9e92bb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -776,6 +776,43 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
dispose_list(&freeable);
}

+void prune_icache_one(struct inode *inode)
+{
+ unsigned long reap = 0;
+
+ /* We are still holding this inode, and we are
+ * expecting the last iput() will finally
+ * evict it.
+ */
+ spin_lock(&inode->i_lock);
+
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ if (inode->i_state & I_REFERENCED)
+ inode->i_state &= ~I_REFERENCED;
+
+ inode_lru_list_del(inode);
+
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_mapping_pages(&inode->i_data,
+ 0, -1);
+ iput(inode);
+ } else
+ spin_unlock(&inode->i_lock);
+
+ if (reap) {
+ __count_vm_events(PGINODESTEAL, reap);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += reap;
+ }
+}
+
static void __wait_on_freeing_inode(struct inode *inode);
/*
* Called with the inode lock held.
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index f550f89..6f2b24b 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -27,6 +27,7 @@
#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)

+#define F_DROP_CACHES (F_LINUX_SPECIFIC_BASE + 9)
/*
* Types of directory notifications that may be requested.
*/
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 038076b..d39e4b9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1538,6 +1538,8 @@ struct super_block {
/* superblock cache pruning functions */
extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
+extern void prune_icache_one(struct inode *inode);
+extern void prune_dcache_one(struct dentry *dentry);

extern struct timespec current_fs_time(struct super_block *sb);

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce26716..1ad3fc1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1555,6 +1555,7 @@ int in_gate_area_no_mm(unsigned long addr);

int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+void file_drop_caches(struct file *filp, unsigned long which);
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages);

2012-05-30 15:14:13

by Pádraig Brady

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On 05/30/2012 02:38 PM, Cong Wang wrote:
> This is a draft patch of implementing per-file drop caches.
>
> It introduces a new fcntl command F_DROP_CACHES to drop
> file caches of a specific file. The reason is that currently
> we only have a system-wide drop caches interface, it could
> cause system-wide performance down if we drop all page caches
> when we actually want to drop the caches of some huge file.

This is useful functionality.
Though isn't it already provided with POSIX_FADV_DONTNEED?

This functionality was added to GNU dd (8.11) a year ago:
http://git.sv.gnu.org/gitweb/?p=coreutils.git;a=commitdiff;h=5f31155

Here are the examples from that patch:

# Advise to drop cache for whole file
dd if=ifile iflag=nocache count=0

# Ensure drop cache for the whole file
dd of=ofile oflag=nocache conv=notrunc,fdatasync count=0

# Drop cache for part of file
dd if=ifile iflag=nocache skip=10 count=10 of=/dev/null

# Stream data using just the read-ahead cache
dd if=ifile of=ofile iflag=nocache oflag=nocache

cheers,
Pádraig.

2012-05-30 15:35:56

by John Stoffel

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

Cong> This is a draft patch of implementing per-file drop caches.

Interesting. So can I do this from outside a process? I'm a
SysAdmin, so my POV is from noticing, finding and fixing performance
problems when the system is under pressure.

Cong> It introduces a new fcntl command F_DROP_CACHES to drop
Cong> file caches of a specific file. The reason is that currently
Cong> we only have a system-wide drop caches interface, it could
Cong> cause system-wide performance down if we drop all page caches
Cong> when we actually want to drop the caches of some huge file.

How can I tell how much cache is used by a file? And what is the
performance impact of this when run on a busy system? And what does
this patch buy us since I figure the VM should already be dropping
caches once the system comes under mem pressure...

Cong> Below is small test case for this patch:

Cong> #include <unistd.h>
Cong> #include <stdlib.h>
Cong> #include <stdio.h>
Cong> #define __USE_GNU
Cong> #include <fcntl.h>

Cong> int
Cong> main(int argc, char *argv[])
Cong> {
Cong> int fd;
Cong> fd = open(argv[1], O_RDONLY);
Cong> if (fd == -1) {
Cong> perror("open");
Cong> return 1;
Cong> }
Cong> printf("Before readahead:\n");
Cong> system("grep ^Cache /proc/meminfo");
Cong> if (readahead(fd, 0, 1024*1024*100)) {
Cong> perror("open");
Cong> return 1;
Cong> }
Cong> printf("Before drop cache:\n");
Cong> system("grep ^Cache /proc/meminfo");
Cong> fcntl(fd, 1024+9, 3);
Cong> printf("After drop cache:\n");
Cong> system("grep ^Cache /proc/meminfo");
Cong> close(fd);
Cong> return 0;
Cong> }

Cong> I used a file of 100M size for testing, and I can see
Cong> the cache size of the whole system drops 70000K after
Cong> dropping the caches of this big file.

Cong> Any comments?

Cong> Signed-off-by: Cong Wang <[email protected]>
Cong> Cc: Alexander Viro <[email protected]>
Cong> Cc: Matthew Wilcox <[email protected]>
Cong> Cc: [email protected]
Cong> Cc: [email protected]
Cong> Cc: [email protected]

Cong> ---
Cong> fs/dcache.c | 56 ++++++++++++++++++++++++++++++------------------
Cong> fs/drop_caches.c | 30 ++++++++++++++++++++++++++
Cong> fs/fcntl.c | 4 +++
Cong> fs/inode.c | 37 ++++++++++++++++++++++++++++++++
Cong> include/linux/fcntl.h | 1 +
Cong> include/linux/fs.h | 2 +
Cong> include/linux/mm.h | 1 +
Cong> 7 files changed, 110 insertions(+), 21 deletions(-)

Cong> diff --git a/fs/dcache.c b/fs/dcache.c
Cong> index 4435d8b..5262851 100644
Cong> --- a/fs/dcache.c
Cong> +++ b/fs/dcache.c
Cong> @@ -585,28 +585,14 @@ kill_it:
Cong> }
Cong> EXPORT_SYMBOL(dput);

Cong> -/**
Cong> - * d_invalidate - invalidate a dentry
Cong> - * @dentry: dentry to invalidate
Cong> - *
Cong> - * Try to invalidate the dentry if it turns out to be
Cong> - * possible. If there are other dentries that can be
Cong> - * reached through this one we can't delete it and we
Cong> - * return -EBUSY. On success we return 0.
Cong> - *
Cong> - * no dcache lock.
Cong> - */
Cong> -
Cong> -int d_invalidate(struct dentry * dentry)
Cong> +int __d_invalidate(struct dentry * dentry)
Cong> {
Cong> /*
Cong> * If it's already been dropped, return OK.
Cong> */
Cong> - spin_lock(&dentry->d_lock);
Cong> - if (d_unhashed(dentry)) {
Cong> - spin_unlock(&dentry->d_lock);
Cong> + if (d_unhashed(dentry))
Cong> return 0;
Cong> - }
Cong> +
Cong> /*
Cong> * Check whether to do a partial shrink_dcache
Cong> * to get rid of unused child entries.
Cong> @@ -630,16 +616,33 @@ int d_invalidate(struct dentry * dentry)
Cong> * directory or not.
Cong> */
Cong> if (dentry->d_count > 1 && dentry->d_inode) {
Cong> - if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
Cong> - spin_unlock(&dentry->d_lock);
Cong> + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry))
Cong> return -EBUSY;
Cong> - }
Cong> }

Cong> __d_drop(dentry);
Cong> - spin_unlock(&dentry->d_lock);
Cong> return 0;
Cong> }
Cong> +
Cong> +/**
Cong> + * d_invalidate - invalidate a dentry
Cong> + * @dentry: dentry to invalidate
Cong> + *
Cong> + * Try to invalidate the dentry if it turns out to be
Cong> + * possible. If there are other dentries that can be
Cong> + * reached through this one we can't delete it and we
Cong> + * return -EBUSY. On success we return 0.
Cong> + *
Cong> + * no dcache lock.
Cong> + */
Cong> +int d_invalidate(struct dentry * dentry)
Cong> +{
Cong> + int ret;
Cong> + spin_lock(&dentry->d_lock);
Cong> + ret = __d_invalidate(dentry);
Cong> + spin_unlock(&dentry->d_lock);
Cong> + return ret;
Cong> +}
Cong> EXPORT_SYMBOL(d_invalidate);

Cong> /* This must be called with d_lock held */
Cong> @@ -898,6 +901,17 @@ relock:
Cong> shrink_dentry_list(&tmp);
Cong> }

Cong> +void prune_dcache_one(struct dentry *dentry)
Cong> +{
Cong> + spin_lock(&dentry->d_lock);
Cong> + if (dentry->d_flags & DCACHE_REFERENCED)
Cong> + dentry->d_flags &= ~DCACHE_REFERENCED;
Cong> + dentry_lru_del(dentry);
Cong> + dentry->d_flags |= DCACHE_SHRINK_LIST;
Cong> + __d_invalidate(dentry);
Cong> + spin_unlock(&dentry->d_lock);
Cong> +}
Cong> +
Cong> /**
Cong> * shrink_dcache_sb - shrink dcache for a superblock
Cong> * @sb: superblock
Cong> diff --git a/fs/drop_caches.c b/fs/drop_caches.c
Cong> index c00e055..805f150 100644
Cong> --- a/fs/drop_caches.c
Cong> +++ b/fs/drop_caches.c
Cong> @@ -65,3 +65,33 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
Cong> }
Cong> return 0;
Cong> }
Cong> +
Cong> +static void drop_pagecache_file(struct file *filp)
Cong> +{
Cong> + struct inode *inode = filp->f_path.dentry->d_inode;
Cong> +
Cong> + spin_lock(&inode->i_lock);
Cong> + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
Cong> + (inode->i_mapping->nrpages == 0)) {
Cong> + spin_unlock(&inode->i_lock);
Cong> + return;
Cong> + }
Cong> + __iget(inode);
Cong> + spin_unlock(&inode->i_lock);
Cong> + invalidate_mapping_pages(inode->i_mapping, 0, -1);
Cong> + iput(inode);
Cong> +}
Cong> +
Cong> +
Cong> +void file_drop_caches(struct file *filp, unsigned long which)
Cong> +{
Cong> + if (which & 1)
Cong> + drop_pagecache_file(filp);
Cong> +
Cong> + if (which & 2) {
Cong> + struct dentry *dentry = filp->f_path.dentry;
Cong> +
Cong> + prune_dcache_one(dentry);
Cong> + prune_icache_one(dentry->d_inode);
Cong> + }
Cong> +}
Cong> diff --git a/fs/fcntl.c b/fs/fcntl.c
Cong> index d078b75..a97f10a 100644
Cong> --- a/fs/fcntl.c
Cong> +++ b/fs/fcntl.c
Cong> @@ -420,6 +420,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
Cong> case F_GETPIPE_SZ:
Cong> err = pipe_fcntl(filp, cmd, arg);
Cong> break;
Cong> + case F_DROP_CACHES:
Cong> + err = 0;
Cong> + file_drop_caches(filp, arg);
Cong> + break;
Cong> default:
Cong> break;
Cong> }
Cong> diff --git a/fs/inode.c b/fs/inode.c
Cong> index 6bc8761..a9e92bb 100644
Cong> --- a/fs/inode.c
Cong> +++ b/fs/inode.c
Cong> @@ -776,6 +776,43 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
Cong> dispose_list(&freeable);
Cong> }

Cong> +void prune_icache_one(struct inode *inode)
Cong> +{
Cong> + unsigned long reap = 0;
Cong> +
Cong> + /* We are still holding this inode, and we are
Cong> + * expecting the last iput() will finally
Cong> + * evict it.
Cong> + */
Cong> + spin_lock(&inode->i_lock);
Cong> +
Cong> + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
Cong> + spin_unlock(&inode->i_lock);
Cong> + return;
Cong> + }
Cong> +
Cong> + if (inode->i_state & I_REFERENCED)
Cong> + inode->i_state &= ~I_REFERENCED;
Cong> +
Cong> + inode_lru_list_del(inode);
Cong> +
Cong> + if (inode_has_buffers(inode) || inode->i_data.nrpages) {
Cong> + __iget(inode);
Cong> + spin_unlock(&inode->i_lock);
Cong> + if (remove_inode_buffers(inode))
Cong> + reap += invalidate_mapping_pages(&inode->i_data,
Cong> + 0, -1);
Cong> + iput(inode);
Cong> + } else
Cong> + spin_unlock(&inode->i_lock);
Cong> +
Cong> + if (reap) {
Cong> + __count_vm_events(PGINODESTEAL, reap);
Cong> + if (current->reclaim_state)
Cong> + current->reclaim_state->reclaimed_slab += reap;
Cong> + }
Cong> +}
Cong> +
Cong> static void __wait_on_freeing_inode(struct inode *inode);
Cong> /*
Cong> * Called with the inode lock held.
Cong> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
Cong> index f550f89..6f2b24b 100644
Cong> --- a/include/linux/fcntl.h
Cong> +++ b/include/linux/fcntl.h
Cong> @@ -27,6 +27,7 @@
Cong> #define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
Cong> #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)

Cong> +#define F_DROP_CACHES (F_LINUX_SPECIFIC_BASE + 9)
Cong> /*
Cong> * Types of directory notifications that may be requested.
Cong> */
Cong> diff --git a/include/linux/fs.h b/include/linux/fs.h
Cong> index 038076b..d39e4b9 100644
Cong> --- a/include/linux/fs.h
Cong> +++ b/include/linux/fs.h
Cong> @@ -1538,6 +1538,8 @@ struct super_block {
Cong> /* superblock cache pruning functions */
Cong> extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
Cong> extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
Cong> +extern void prune_icache_one(struct inode *inode);
Cong> +extern void prune_dcache_one(struct dentry *dentry);

Cong> extern struct timespec current_fs_time(struct super_block *sb);

Cong> diff --git a/include/linux/mm.h b/include/linux/mm.h
Cong> index ce26716..1ad3fc1 100644
Cong> --- a/include/linux/mm.h
Cong> +++ b/include/linux/mm.h
Cong> @@ -1555,6 +1555,7 @@ int in_gate_area_no_mm(unsigned long addr);

Cong> int drop_caches_sysctl_handler(struct ctl_table *, int,
Cong> void __user *, size_t *, loff_t *);
Cong> +void file_drop_caches(struct file *filp, unsigned long which);
Cong> unsigned long shrink_slab(struct shrink_control *shrink,
Cong> unsigned long nr_pages_scanned,
Cong> unsigned long lru_pages);
Cong> --
Cong> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Cong> the body of a message to [email protected]
Cong> More majordomo info at http://vger.kernel.org/majordomo-info.html
Cong> Please read the FAQ at http://www.tux.org/lkml/

2012-05-31 06:20:49

by Cong Wang

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
> On 05/30/2012 02:38 PM, Cong Wang wrote:
> > This is a draft patch of implementing per-file drop caches.
> >
> > It introduces a new fcntl command F_DROP_CACHES to drop
> > file caches of a specific file. The reason is that currently
> > we only have a system-wide drop caches interface, it could
> > cause system-wide performance down if we drop all page caches
> > when we actually want to drop the caches of some huge file.
>
> This is useful functionality.
> Though isn't it already provided with POSIX_FADV_DONTNEED?

Thanks for teaching this!

However, from the source code of madvise_dontneed() it looks like it is
using a totally different way to drop page caches, that is to invalidate
the page mapping, and trigger a re-mapping of the file pages after a
page fault. So, yeah, this could probably drop the page caches too (I am
not so sure, haven't checked the code in details), but with my patch, it
flushes the page caches directly, what's more, it can also prune
dcache/icache of the file.

Cheers.

2012-05-31 06:28:38

by Cong Wang

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On Wed, 2012-05-30 at 11:12 -0400, John Stoffel wrote:
> Cong> This is a draft patch of implementing per-file drop caches.
>
> Interesting. So can I do this from outside a process? I'm a
> SysAdmin, so my POV is from noticing, finding and fixing performance
> problems when the system is under pressure.

Yes, sure, we need to write a utility (or patch an existing one) to do
this for you admins.

>
> Cong> It introduces a new fcntl command F_DROP_CACHES to drop
> Cong> file caches of a specific file. The reason is that currently
> Cong> we only have a system-wide drop caches interface, it could
> Cong> cause system-wide performance down if we drop all page caches
> Cong> when we actually want to drop the caches of some huge file.
>
> How can I tell how much cache is used by a file? And what is the
> performance impact of this when run on a busy system? And what does
> this patch buy us since I figure the VM should already be dropping
> caches once the system comes under mem pressure...
>

AFAIK, we don't export such information to user-space, we only have
system-wide statistics.

Keiichi (in Cc) once wrote a patch to implement page cache tracepoint:
http://marc.info/?l=linux-mm&m=131102496904326&w=3

but the patches are still not in upstream.

Thanks!

2012-05-31 06:30:26

by KOSAKI Motohiro

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

(5/31/12 2:20 AM), Cong Wang wrote:
> On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
>> On 05/30/2012 02:38 PM, Cong Wang wrote:
>>> This is a draft patch of implementing per-file drop caches.
>>>
>>> It introduces a new fcntl command F_DROP_CACHES to drop
>>> file caches of a specific file. The reason is that currently
>>> we only have a system-wide drop caches interface, it could
>>> cause system-wide performance down if we drop all page caches
>>> when we actually want to drop the caches of some huge file.
>>
>> This is useful functionality.
>> Though isn't it already provided with POSIX_FADV_DONTNEED?
>
> Thanks for teaching this!
>
> However, from the source code of madvise_dontneed() it looks like it is
> using a totally different way to drop page caches, that is to invalidate
> the page mapping, and trigger a re-mapping of the file pages after a
> page fault. So, yeah, this could probably drop the page caches too (I am
> not so sure, haven't checked the code in details), but with my patch, it
> flushes the page caches directly, what's more, it can also prune
> dcache/icache of the file.

madvise should work. I don't think we need a duplicate interface. Moreover,
madvise(2) is cleaner than fcntl(2).

2012-05-31 12:11:38

by Cong Wang

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On Thu, 2012-05-31 at 02:30 -0400, KOSAKI Motohiro wrote:
> (5/31/12 2:20 AM), Cong Wang wrote:
> > On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
> >> On 05/30/2012 02:38 PM, Cong Wang wrote:
> >>> This is a draft patch of implementing per-file drop caches.
> >>>
> >>> It introduces a new fcntl command F_DROP_CACHES to drop
> >>> file caches of a specific file. The reason is that currently
> >>> we only have a system-wide drop caches interface, it could
> >>> cause system-wide performance down if we drop all page caches
> >>> when we actually want to drop the caches of some huge file.
> >>
> >> This is useful functionality.
> >> Though isn't it already provided with POSIX_FADV_DONTNEED?
> >
> > Thanks for teaching this!
> >
> > However, from the source code of madvise_dontneed() it looks like it is
> > using a totally different way to drop page caches, that is to invalidate
> > the page mapping, and trigger a re-mapping of the file pages after a
> > page fault. So, yeah, this could probably drop the page caches too (I am
> > not so sure, haven't checked the code in details), but with my patch, it
> > flushes the page caches directly, what's more, it can also prune
> > dcache/icache of the file.
>
> madvise should work. I don't think we need duplicate interface. Moreomover
> madvise(2) is cleaner than fcntl(2).
>

I think madvise(DONTNEED) attacks the problem with a different approach:
it unmaps the file mapping and, as a side effect, drops the page caches. My
approach is to drop the page caches directly, similar to what the
drop_caches sysctl does.

What about private file mapping? Could madvise(DONTNEED) drop the page
caches too even when the other process is doing the same private file
mapping? At least my patch could do this.

I am not sure if fcntl() is a good interface either, this is why the
patch is marked as RFC. :-D

Thanks!

2012-05-31 19:09:08

by KOSAKI Motohiro

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

(5/31/12 8:11 AM), Cong Wang wrote:
> On Thu, 2012-05-31 at 02:30 -0400, KOSAKI Motohiro wrote:
>> (5/31/12 2:20 AM), Cong Wang wrote:
>>> On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
>>>> On 05/30/2012 02:38 PM, Cong Wang wrote:
>>>>> This is a draft patch of implementing per-file drop caches.
>>>>>
>>>>> It introduces a new fcntl command F_DROP_CACHES to drop
>>>>> file caches of a specific file. The reason is that currently
>>>>> we only have a system-wide drop caches interface, it could
>>>>> cause system-wide performance down if we drop all page caches
>>>>> when we actually want to drop the caches of some huge file.
>>>>
>>>> This is useful functionality.
>>>> Though isn't it already provided with POSIX_FADV_DONTNEED?
>>>
>>> Thanks for teaching this!
>>>
>>> However, from the source code of madvise_dontneed() it looks like it is
>>> using a totally different way to drop page caches, that is to invalidate
>>> the page mapping, and trigger a re-mapping of the file pages after a
>>> page fault. So, yeah, this could probably drop the page caches too (I am
>>> not so sure, haven't checked the code in details), but with my patch, it
>>> flushes the page caches directly, what's more, it can also prune
>>> dcache/icache of the file.
>>
>> madvise should work. I don't think we need duplicate interface. Moreomover
>> madvise(2) is cleaner than fcntl(2).
>>
>
> I think madvise(DONTNEED) attacks the problem in a different approach,
> it munmaps the file mapping and by the way drops the page caches, my
> approach is to drop the page caches directly similar to what sysctl
> drop_caches.
>
> What about private file mapping? Could madvise(DONTNEED) drop the page
> caches too even when the other process is doing the same private file
> mapping? At least my patch could do this.

Right. But a process can create another mapping if it has sufficient
permission, and if it doesn't, it shouldn't be able to drop a shared
cache.

> I am not sure if fcntl() is a good interface either, this is why the
> patch is marked as RFC. :-D

But if you can find a concrete use case, I'm no longer against it.

2012-06-01 11:32:31

by Cong Wang

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On Thu, 2012-05-31 at 15:09 -0400, KOSAKI Motohiro wrote:
> (5/31/12 8:11 AM), Cong Wang wrote:
> > On Thu, 2012-05-31 at 02:30 -0400, KOSAKI Motohiro wrote:
> >> (5/31/12 2:20 AM), Cong Wang wrote:
> >>> On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
> >>>> On 05/30/2012 02:38 PM, Cong Wang wrote:
> >>>>> This is a draft patch of implementing per-file drop caches.
> >>>>>
> >>>>> It introduces a new fcntl command F_DROP_CACHES to drop
> >>>>> file caches of a specific file. The reason is that currently
> >>>>> we only have a system-wide drop caches interface, it could
> >>>>> cause system-wide performance down if we drop all page caches
> >>>>> when we actually want to drop the caches of some huge file.
> >>>>
> >>>> This is useful functionality.
> >>>> Though isn't it already provided with POSIX_FADV_DONTNEED?
> >>>
> >>> Thanks for teaching this!
> >>>
> >>> However, from the source code of madvise_dontneed() it looks like it is
> >>> using a totally different way to drop page caches, that is to invalidate
> >>> the page mapping, and trigger a re-mapping of the file pages after a
> >>> page fault. So, yeah, this could probably drop the page caches too (I am
> >>> not so sure, haven't checked the code in details), but with my patch, it
> >>> flushes the page caches directly, what's more, it can also prune
> >>> dcache/icache of the file.
> >>
> >> madvise should work. I don't think we need duplicate interface. Moreomover
> >> madvise(2) is cleaner than fcntl(2).
> >>
> >
> > I think madvise(DONTNEED) attacks the problem in a different approach,
> > it munmaps the file mapping and by the way drops the page caches, my
> > approach is to drop the page caches directly similar to what sysctl
> > drop_caches.
> >
> > What about private file mapping? Could madvise(DONTNEED) drop the page
> > caches too even when the other process is doing the same private file
> > mapping? At least my patch could do this.
>
> Right. But a process can makes another mappings if a process have enough
> permission. and if it doesn't, a process shouldn't be able to drop a shared
> cache.
>

Ok, then this patch is not a dup of madvise(DONTNEED).

>
> > I am not sure if fcntl() is a good interface either, this is why the
> > patch is marked as RFC. :-D
>
> But, if you can find certain usecase, I'm not against anymore.
>

Yeah, at least John Stoffel expressed interest in this, as a
sysadmin. So I believe there are some people who need it.

Now the problem is that I can't find a suitable existing utility to patch;
maybe Pádraig has some hints on this? Could this feature be merged into
some core utility? Or do I have to write a new utility for this?

Thanks.

2012-06-01 13:08:23

by John Stoffel

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

>>>>> "Cong" == Cong Wang <[email protected]> writes:

Cong> On Thu, 2012-05-31 at 15:09 -0400, KOSAKI Motohiro wrote:
>> (5/31/12 8:11 AM), Cong Wang wrote:
>> > On Thu, 2012-05-31 at 02:30 -0400, KOSAKI Motohiro wrote:
>> >> (5/31/12 2:20 AM), Cong Wang wrote:
>> >>> On Wed, 2012-05-30 at 16:14 +0100, Pádraig Brady wrote:
>> >>>> On 05/30/2012 02:38 PM, Cong Wang wrote:
>> >>>>> This is a draft patch of implementing per-file drop caches.
>> >>>>>
>> >>>>> It introduces a new fcntl command F_DROP_CACHES to drop
>> >>>>> file caches of a specific file. The reason is that currently
>> >>>>> we only have a system-wide drop caches interface, it could
>> >>>>> cause system-wide performance down if we drop all page caches
>> >>>>> when we actually want to drop the caches of some huge file.
>> >>>>
>> >>>> This is useful functionality.
>> >>>> Though isn't it already provided with POSIX_FADV_DONTNEED?
>> >>>
>> >>> Thanks for teaching this!
>> >>>
>> >>> However, from the source code of madvise_dontneed() it looks like it is
>> >>> using a totally different way to drop page caches, that is to invalidate
>> >>> the page mapping, and trigger a re-mapping of the file pages after a
>> >>> page fault. So, yeah, this could probably drop the page caches too (I am
>> >>> not so sure, haven't checked the code in details), but with my patch, it
>> >>> flushes the page caches directly, what's more, it can also prune
>> >>> dcache/icache of the file.
>> >>
>> >> madvise should work. I don't think we need duplicate interface. Moreomover
>> >> madvise(2) is cleaner than fcntl(2).
>> >>
>> >
>> > I think madvise(DONTNEED) attacks the problem in a different approach,
>> > it munmaps the file mapping and by the way drops the page caches, my
>> > approach is to drop the page caches directly similar to what sysctl
>> > drop_caches.
>> >
>> > What about private file mapping? Could madvise(DONTNEED) drop the page
>> > caches too even when the other process is doing the same private file
>> > mapping? At least my patch could do this.
>>
>> Right. But a process can makes another mappings if a process have enough
>> permission. and if it doesn't, a process shouldn't be able to drop a shared
>> cache.
>>

Cong> Ok, then this patch is not a dup of madvise(DONTNEED).

>>
>> > I am not sure if fcntl() is a good interface either, this is why the
>> > patch is marked as RFC. :-D
>>
>> But, if you can find certain usecase, I'm not against anymore.
>>

Cong> Yeah, at least John Stoffel expressed his interests on this, as
Cong> a sysadmin. So I believe there are some people need it.

I expressed an interest if there was a way to usefully *find* the
processes that are hogging cache. Without a reporting mechanism of
cache usage on per-file or per-process manner, then I don't see a
great use for this. It's just simpler to drop all the caches when you
hit a wall.

Cong> Now the problem is that I don't find a proper existing utility
Cong> to patch, maybe Pádraig has any hints on this? Could this
Cong> feature be merged into some core utility? Or I have to write a
Cong> new utility for this?

I'd write a new tutorial utility, maybe you could call it 'cache_top'
and have it both show the biggest users of cache, as well as exposing
your new ability to drop the cache on a per-fd basis.

It's really not much use unless we can measure it.

John

2012-06-04 03:28:57

by Cong Wang

[permalink] [raw]

Subject: Re: [RFC Patch] fs: implement per-file drop caches

On Fri, 2012-06-01 at 09:08 -0400, John Stoffel wrote:
> >>>>> "Cong" == Cong Wang <[email protected]> writes:
>
> Cong> Yeah, at least John Stoffel expressed his interests on this, as
> Cong> a sysadmin. So I believe there are some people need it.
>
> I expressed an interest if there was a way to usefully *find* the
> processes that are hogging cache. Without a reporting mechanism of
> cache usage on per-file or per-process manner, then I don't see a
> great use for this. It's just simpler to drop all the caches when you
> hit a wall.
>
> Cong> Now the problem is that I don't find a proper existing utility
> Cong> to patch, maybe Pádraig has any hints on this? Could this
> Cong> feature be merged into some core utility? Or I have to write a
> Cong> new utility for this?
>
> I'd write a new tutorial utility, maybe you could call it 'cache_top'
> and have it both show the biggest users of cache, as well as exposing
> your new ability to drop the cache on a per-fd basis.
>
> It's really not much use unless we can measure it.

Fair enough.

We could do that with Keiichi's page cache tracepoint patches:
https://lkml.org/lkml/2011/7/18/326

With that patch, we can measure page caches with `perf`. I tried to
carry Keiichi's patches, but those patches depend on other patches too.
The main problem is still translating the inode number to a file name for
user-space users to read, which is not trivial at all.

Also, will vmtouch work for you too? You can get it at
http://hoytech.com/vmtouch/

I can patch it too if you want.

Thanks!