2011-04-26 21:35:39

by Andrea Righi

[permalink] [raw]
Subject: [RFC] [PATCH] drop_pagecache syscall

Introduce sys_drop_pagecache() system call to drop the page cache pages of
a single filesystem.

This new system call takes a file descriptor as argument and drops only
the page cache pages of the file system it references.

At the moment it is possible to drop page cache pages via
/proc/sys/vm/drop_caches or via posix_fadvise(POSIX_FADV_DONTNEED).

The first method drops the whole page cache while the second can be used
to drop page cache pages of a single file descriptor. But there's not a
simple way to drop all the pages of a filesystem (we could scan all the
file descriptors and use posix_fadvise(), but this solution doesn't scale
very well in some cases).

This functionality can be used by all the applications that want to have a
better control over the page cache management (for example to immediately drop
pages that for sure will not be reused in the near future, without calling
posix_fadvise() for all the files they've touched), or to provide a more fine
grained debugging feature usable by the filesystem benchmarks.

The system call does not require root privileges and it can be called by any
unprivileged application. For example, we can write a userspace tool to run
something like this:

$ drop-pagecache /path/file_or_dir

A practical example:

$ ls -lh /mnt/sda/zero /mnt/sdb/zero
-rw-r--r-- 1 root root 16M 2011-04-20 10:20 /mnt/sda/zero
-rw-r--r-- 1 root root 16M 2011-04-20 10:20 /mnt/sdb/zero

$ grep ^Cached /proc/meminfo
Cached: 5660 kB
$ md5sum /mnt/sda/zero /mnt/sdb/zero
2c7ab85a893283e98c931e9511add182 /mnt/sda/zero
2c7ab85a893283e98c931e9511add182 /mnt/sdb/zero
$ grep ^Cached /proc/meminfo
Cached: 38544 kB
$ ./drop-pagecache /mnt/sda/
$ grep ^Cached /proc/meminfo
Cached: 22440 kB
$ ./drop-pagecache /mnt/sdb/
$ grep ^Cached /proc/meminfo
Cached: 5056 kB

TODO:
- also provide support for architectures other than x86/x86_64

Signed-off-by: Andrea Righi <[email protected]>
---
arch/x86/ia32/ia32entry.S | 1 +
arch/x86/include/asm/unistd_32.h | 3 ++-
arch/x86/include/asm/unistd_64.h | 2 ++
arch/x86/kernel/syscall_table_32.S | 1 +
fs/drop_caches.c | 24 ++++++++++++++++++++++++
include/asm-generic/unistd.h | 4 +++-
include/linux/syscalls.h | 1 +
7 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d2..d32f67c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -848,4 +848,5 @@ ia32_sys_call_table:
.quad compat_sys_open_by_handle_at
.quad compat_sys_clock_adjtime
.quad sys_syncfs
+ .quad sys_drop_pagecache /* 345 */
ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5..1c6630b 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,11 @@
#define __NR_open_by_handle_at 342
#define __NR_clock_adjtime 343
#define __NR_syncfs 344
+#define __NR_drop_pagecache 345

#ifdef __KERNEL__

-#define NR_syscalls 345
+#define NR_syscalls 346

#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76..3234734 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,8 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
#define __NR_syncfs 306
__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_drop_pagecache 307
+__SYSCALL(__NR_drop_pagecache, sys_drop_pagecache)

#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d..6355af6 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,4 @@ ENTRY(sys_call_table)
.long sys_open_by_handle_at
.long sys_clock_adjtime
.long sys_syncfs
+ .long sys_drop_pagecache /* 345 */
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c8..ac043c7 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -6,6 +6,8 @@
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/writeback.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
#include "internal.h"
@@ -37,6 +39,28 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}

+/*
+ * Drop page cache of a single superblock
+ */
+SYSCALL_DEFINE1(drop_pagecache, int, fd)
+{
+ struct file *file;
+ struct super_block *sb;
+ int fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+ sb = file->f_dentry->d_sb;
+
+ down_read(&sb->s_umount);
+ drop_pagecache_sb(sb, NULL);
+ up_read(&sb->s_umount);
+
+ fput_light(file, fput_needed);
+ return 0;
+}
+
static void drop_slab(void)
{
int nr_objects;
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 07c40d5..088ff08 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -654,9 +654,11 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
#define __NR_syncfs 267
__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_drop_pagecache 268
+__SYSCALL(__NR_drop_pagecache, sys_drop_pagecache)

#undef __NR_syscalls
-#define __NR_syscalls 268
+#define __NR_syscalls 269

/*
* All syscalls below here should go away really,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 83ecc17..af2a5c7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -826,6 +826,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
u64 mask, int fd,
const char __user *pathname);
asmlinkage long sys_syncfs(int fd);
+asmlinkage long sys_drop_pagecache(int fd);

int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);

--
1.7.1


2011-04-27 00:15:11

by Dave Chinner

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> Introduce sys_drop_pagecache() system call to drop the page cache pages of
> a single filesystem.
>
> This new system call takes a file descriptor as argument and drops only
> the page cache pages of the file system it references.
>
> At the moment it is possible to drop page cache pages via
> /proc/sys/vm/drop_caches or via posix_fadvise(POSIX_FADV_DONTNEED).
>
> The first method drops the whole page cache while the second can be used
> to drop page cache pages of a single file descriptor. But there's not a
> simple way to drop all the pages of a filesystem (we could scan all the
> file descriptors and use posix_fadvise(), but this solution doesn't scale
> very well in some cases).

Why not just add a new posix_fadvise() command? e.g.
POSIX_FADV_DONTNEED_FS. Simpler than adding a new syscall...

> This functionality can be used by all the applications that want to have a
> better control over the page cache management (for example to immediately drop
> pages that for sure will not be reused in the near future, without calling
> posix_fadvise() for all the files they've touched), or to provide a more fine
> grained debugging feature usable by the filesystem benchmarks.
>
> The system call does not require root privileges and it can be called by any
> unprivileged application. For example, we can write a userspace tool to run
> something like this:
>
> $ drop-pagecache /path/file_or_dir

That's a potential DOS vector, I think. Drop the pagecache in a hard
loop on the root fs of a busy server and watch it crawl...

> +/*
> + * Drop page cache of a single superblock
> + */
> +SYSCALL_DEFINE1(drop_pagecache, int, fd)
> +{
> + struct file *file;
> + struct super_block *sb;
> + int fput_needed;
> +
> + file = fget_light(fd, &fput_needed);
> + if (!file)
> + return -EBADF;
> + sb = file->f_dentry->d_sb;
> +
> + down_read(&sb->s_umount);
> + drop_pagecache_sb(sb, NULL);
> + up_read(&sb->s_umount);
> +
> + fput_light(file, fput_needed);
> + return 0;

You're holding an open reference to a file/dir on the fs so it can't
be unmounted from under you. Hence I don't think you need the
s_umount locking.

Cheers,

Dave.
--
Dave Chinner
[email protected]

2011-04-27 09:01:37

by Andrea Righi

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> > Introduce sys_drop_pagecache() system call to drop the page cache pages of
> > a single filesystem.
> >
> > This new system call takes a file descriptor as argument and drops only
> > the page cache pages of the file system it references.
> >
> > At the moment it is possible to drop page cache pages via
> > /proc/sys/vm/drop_caches or via posix_fadvise(POSIX_FADV_DONTNEED).
> >
> > The first method drops the whole page cache while the second can be used
> > to drop page cache pages of a single file descriptor. But there's not a
> > simple way to drop all the pages of a filesystem (we could scan all the
> > file descriptors and use posix_fadvise(), but this solution doesn't scale
> > very well in some cases).
>
> Why not just add a new posix_fadvise() command? e.g.
> POSIX_FADV_DONTNEED_FS. Simpler than adding a new syscall...

Agreed.

>
> > This functionality can be used by all the applications that want to have a
> > better control over the page cache management (for example to immediately drop
> > pages that for sure will not be reused in the near future, without calling
> > posix_fadvise() for all the files they've touched), or to provide a more fine
> > grained debugging feature usable by the filesystem benchmarks.
> >
> > The system call does not require root privileges and it can be called by any
> > unprivileged application. For example, we can write a userspace tool to run
> > something like this:
> >
> > $ drop-pagecache /path/file_or_dir
>
> That's a potential DOS vector, I think. Drop the pagecache in a hard
> loop on the root fs of a busy server and watch it crawl...

Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
this syscall.

>
> > +/*
> > + * Drop page cache of a single superblock
> > + */
> > +SYSCALL_DEFINE1(drop_pagecache, int, fd)
> > +{
> > + struct file *file;
> > + struct super_block *sb;
> > + int fput_needed;
> > +
> > + file = fget_light(fd, &fput_needed);
> > + if (!file)
> > + return -EBADF;
> > + sb = file->f_dentry->d_sb;
> > +
> > + down_read(&sb->s_umount);
> > + drop_pagecache_sb(sb, NULL);
> > + up_read(&sb->s_umount);
> > +
> > + fput_light(file, fput_needed);
> > + return 0;
>
> You're holding an open reference to a file/dir on the fs so it can't
> be unmounted from under you. Hence I don't think you need the
> s_umount locking.

Yes, you're right. The fs can't be unmounted, so I also think we can do
it without the s_umount locking.

I'll apply your suggestions, do some tests and post a new version of the
patch.

Thanks for the review.

-Andrea

2011-04-27 09:11:05

by Mike Frysinger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
> On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
>> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
>> > This functionality can be used by all the applications that want to have a
>> > better control over the page cache management (for example to immediately drop
>> > pages that for sure will not be reused in the near future, without calling
>> > posix_fadvise() for all the files they've touched), or to provide a more fine
>> > grained debugging feature usable by the filesystem benchmarks.
>> >
>> > The system call does not require root privileges and it can be called by any
>> > unprivileged application. For example, we can write a userspace tool to run
>> > something like this:
>> >
>> > $ drop-pagecache /path/file_or_dir
>>
>> That's a potential DOS vector, I think. Drop the pagecache in a hard
>> loop on the root fs of a busy server and watch it crawl...
>
> Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> this syscall.

if /proc/sys/vm/drop_caches has any checks other than file permission
checks (i.e. UID==0), it'd probably be better to copy those rather
than picking something different.

2011-04-27 09:47:24

by Andrea Righi

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 05:10:41AM -0400, Mike Frysinger wrote:
> On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> >> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> >> > This functionality can be used by all the applications that want to have a
> >> > better control over the page cache management (for example to immediately drop
> >> > pages that for sure will not be reused in the near future, without calling
> >> > posix_fadvise() for all the files they've touched), or to provide a more fine
> >> > grained debugging feature usable by the filesystem benchmarks.
> >> >
> >> > The system call does not require root privileges and it can be called by any
> >> > unprivileged application. For example, we can write a userspace tool to run
> >> > something like this:
> >> >
> >> > $ drop-pagecache /path/file_or_dir
> >>
> >> That's a potential DOS vector, I think. Drop the pagecache in a hard
> >> loop on the root fs of a busy server and watch it crawl...
> >
> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> > this syscall.
>
> if /proc/sys/vm/drop_caches has any checks other than file permission
> checks (i.e. UID==0), it'd probably be better to copy those rather
> than picking something different.

ok, what about checking current_euid() == 0?

Thanks,
-Andrea

2011-04-27 09:50:30

by Mike Frysinger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 05:47, Andrea Righi wrote:
> On Wed, Apr 27, 2011 at 05:10:41AM -0400, Mike Frysinger wrote:
>> On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
>> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
>> >> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
>> >> > This functionality can be used by all the applications that want to have a
>> >> > better control over the page cache management (for example to immediately drop
>> >> > pages that for sure will not be reused in the near future, without calling
>> >> > posix_fadvise() for all the files they've touched), or to provide a more fine
>> >> > grained debugging feature usable by the filesystem benchmarks.
>> >> >
>> >> > The system call does not require root privileges and it can be called by any
>> >> > unprivileged application. For example, we can write a userspace tool to run
>> >> > something like this:
>> >> >
>> >> > $ drop-pagecache /path/file_or_dir
>> >>
>> >> That's a potential DOS vector, I think. Drop the pagecache in a hard
>> >> loop on the root fs of a busy server and watch it crawl...
>> >
>> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
>> > this syscall.
>>
>> if /proc/sys/vm/drop_caches has any checks other than file permission
>> checks (i.e. UID==0), it'd probably be better to copy those rather
>> than picking something different.
>
> ok, what about checking current_euid() == 0?

that's not what i meant. if the drop_caches file already has certain
cap checks/whatever in place, let's use those. if it doesnt, then
picking a cap level as you proposed makes sense.
-mike

2011-04-27 09:57:16

by Andrea Righi

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 05:50:04AM -0400, Mike Frysinger wrote:
> On Wed, Apr 27, 2011 at 05:47, Andrea Righi wrote:
> > On Wed, Apr 27, 2011 at 05:10:41AM -0400, Mike Frysinger wrote:
> >> On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
> >> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> >> >> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> >> >> > This functionality can be used by all the applications that want to have a
> >> >> > better control over the page cache management (for example to immediately drop
> >> >> > pages that for sure will not be reused in the near future, without calling
> >> >> > posix_fadvise() for all the files they've touched), or to provide a more fine
> >> >> > grained debugging feature usable by the filesystem benchmarks.
> >> >> >
> >> >> > The system call does not require root privileges and it can be called by any
> >> >> > unprivileged application. For example, we can write a userspace tool to run
> >> >> > something like this:
> >> >> >
> >> >> > $ drop-pagecache /path/file_or_dir
> >> >>
> >> >> That's a potential DOS vector, I think. Drop the pagecache in a hard
> >> >> loop on the root fs of a busy server and watch it crawl...
> >> >
> >> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> >> > this syscall.
> >>
> >> if /proc/sys/vm/drop_caches has any checks other than file permission
> >> checks (i.e. UID==0), it'd probably be better to copy those rather
> >> than picking something different.
> >
> > ok, what about checking current_euid() == 0?
>
> that's not what i meant. if the drop_caches file already has certain
> cap checks/whatever in place, let's use those. if it doesnt, then
> picking a cap level as you proposed makes sense.

mmh, drop_caches has a file ownership (root:root) and a permission mask
(0644), how to apply the same checks to a system call? The most similar
thing seems to check the current euid. Am I missing something?

-Andrea

2011-04-27 15:26:10

by Mike Frysinger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 05:57, Andrea Righi wrote:
> On Wed, Apr 27, 2011 at 05:50:04AM -0400, Mike Frysinger wrote:
>> On Wed, Apr 27, 2011 at 05:47, Andrea Righi wrote:
>> > On Wed, Apr 27, 2011 at 05:10:41AM -0400, Mike Frysinger wrote:
>> >> On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
>> >> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
>> >> >> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
>> >> >> > This functionality can be used by all the applications that want to have a
>> >> >> > better control over the page cache management (for example to immediately drop
>> >> >> > pages that for sure will not be reused in the near future, without calling
>> >> >> > posix_fadvise() for all the files they've touched), or to provide a more fine
>> >> >> > grained debugging feature usable by the filesystem benchmarks.
>> >> >> >
>> >> >> > The system call does not require root privileges and it can be called by any
>> >> >> > unprivileged application. For example, we can write a userspace tool to run
>> >> >> > something like this:
>> >> >> >
>> >> >> >   $ drop-pagecache /path/file_or_dir
>> >> >>
>> >> >> That's a potential DOS vector, I think. Drop the pagecache in a hard
>> >> >> loop on the root fs of a busy server and watch it crawl...
>> >> >
>> >> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
>> >> > this syscall.
>> >>
>> >> if /proc/sys/vm/drop_caches has any checks other than file permission
>> >> checks (i.e. UID==0), it'd probably be better to copy those rather
>> >> than picking something different.
>> >
>> > ok, what about checking current_euid() == 0?
>>
>> that's not what i meant.  if the drop_caches file already has certain
>> cap checks/whatever in place, let's use those.  if it doesnt, then
>> picking a cap level as you proposed makes sense.
>
> mmh, drop_caches has a file ownership (root:root) and a permission mask
> (0644), how to apply the same checks to a system call? The most similar
> thing seems to check the current euid. Am I missing something?

my (limited) understanding is that you should be using cap checks, not UID
-mike

2011-04-27 15:42:47

by Andrea Righi

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 11:25:47AM -0400, Mike Frysinger wrote:
> On Wed, Apr 27, 2011 at 05:57, Andrea Righi wrote:
> > On Wed, Apr 27, 2011 at 05:50:04AM -0400, Mike Frysinger wrote:
> >> On Wed, Apr 27, 2011 at 05:47, Andrea Righi wrote:
> >> > On Wed, Apr 27, 2011 at 05:10:41AM -0400, Mike Frysinger wrote:
> >> >> On Wed, Apr 27, 2011 at 05:01, Andrea Righi wrote:
> >> >> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> >> >> >> On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> >> >> >> > This functionality can be used by all the applications that want to have a
> >> >> >> > better control over the page cache management (for example to immediately drop
> >> >> >> > pages that for sure will not be reused in the near future, without calling
> >> >> >> > posix_fadvise() for all the files they've touched), or to provide a more fine
> >> >> >> > grained debugging feature usable by the filesystem benchmarks.
> >> >> >> >
> >> >> >> > The system call does not require root privileges and it can be called by any
> >> >> >> > unprivileged application. For example, we can write a userspace tool to run
> >> >> >> > something like this:
> >> >> >> >
> >> >> >> > $ drop-pagecache /path/file_or_dir
> >> >> >>
> >> >> >> That's a potential DOS vector, I think. Drop the pagecache in a hard
> >> >> >> loop on the root fs of a busy server and watch it crawl...
> >> >> >
> >> >> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> >> >> > this syscall.
> >> >>
> >> >> if /proc/sys/vm/drop_caches has any checks other than file permission
> >> >> checks (i.e. UID==0), it'd probably be better to copy those rather
> >> >> than picking something different.
> >> >
> >> > ok, what about checking current_euid() == 0?
> >>
> >> that's not what i meant. if the drop_caches file already has certain
> >> cap checks/whatever in place, let's use those. if it doesnt, then
> >> picking a cap level as you proposed makes sense.
> >
> > mmh, drop_caches has a file ownership (root:root) and a permission mask
> > (0644), how to apply the same checks to a system call? The most similar
> > thing seems to check the current euid. Am I missing something?
>
> my (limited) understanding is that you should be using cap checks, not UID
> -mike

Agreed. This was my initial proposal. It's not very good, but I also
think it's the best option for this case.

Thanks,
-Andrea

2011-04-28 23:22:26

by Joel Becker

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Wed, Apr 27, 2011 at 11:01:28AM +0200, Andrea Righi wrote:
> On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> > On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> > > Introduce sys_drop_pagecache() system call to drop the page cache pages of
> > > a single filesystem.
<snip>
> > > This functionality can be used by all the applications that want to have a
> > > better control over the page cache management (for example to immediately drop
> > > pages that for sure will not be reused in the near future, without calling
> > > posix_fadvise() for all the files they've touched), or to provide a more fine
> > > grained debugging feature usable by the filesystem benchmarks.
> > >
> > > The system call does not require root privileges and it can be called by any
> > > unprivileged application. For example, we can write a userspace tool to run
> > > something like this:
> > >
> > > $ drop-pagecache /path/file_or_dir
> >
> > That's a potential DOS vector, I think. Drop the pagecache in a hard
> > loop on the root fs of a busy server and watch it crawl...
>
> Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> this syscall.

The majority of apps that want this do not run as root. Do we
want them all setuid? ;-)

Joel

--

"If the human brain were so simple we could understand it, we would
be so simple that we could not."
- W. A. Clouston

http://www.jlbec.org/
[email protected]

2011-04-29 08:18:39

by Andrea Righi

[permalink] [raw]
Subject: Re: [RFC] [PATCH] drop_pagecache syscall

On Thu, Apr 28, 2011 at 04:22:10PM -0700, Joel Becker wrote:
> On Wed, Apr 27, 2011 at 11:01:28AM +0200, Andrea Righi wrote:
> > On Wed, Apr 27, 2011 at 10:14:53AM +1000, Dave Chinner wrote:
> > > On Tue, Apr 26, 2011 at 11:35:27PM +0200, Andrea Righi wrote:
> > > > Introduce sys_drop_pagecache() system call to drop the page cache pages of
> > > > a single filesystem.
> <snip>
> > > > This functionality can be used by all the applications that want to have a
> > > > better control over the page cache management (for example to immediately drop
> > > > pages that for sure will not be reused in the near future, without calling
> > > > posix_fadvise() for all the files they've touched), or to provide a more fine
> > > > grained debugging feature usable by the filesystem benchmarks.
> > > >
> > > > The system call does not require root privileges and it can be called by any
> > > > unprivileged application. For example, we can write a userspace tool to run
> > > > something like this:
> > > >
> > > > $ drop-pagecache /path/file_or_dir
> > >
> > > That's a potential DOS vector, I think. Drop the pagecache in a hard
> > > loop on the root fs of a busy server and watch it crawl...
> >
> > Yes, probably we could allow only the CAP_SYS_ADMIN tasks to execute
> > this syscall.
>
> The majority of apps that want this do not run as root. Do we
> want them all setuid? ;-)

Another solution could be to limit the rate of this syscall if executed
by a non-privileged user. And the rate limit could be also configurable
from userspace via /proc/sys/vm/ or something similar.

-Andrea