2014-10-20 21:49:10

by Pieter Smith

[permalink] [raw]
Subject: [PATCH 1/2] fs: Moved sendfile syscall to own source file

Part of the tinification effort. Splitting out the sendfile syscall
allows optional compilation in the succeeding patch.

Signed-off-by: Pieter Smith <[email protected]>
---
fs/Makefile | 3 +-
fs/read_write.c | 176 -------------------------------------------------
fs/sendfile.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 201 insertions(+), 178 deletions(-)
create mode 100644 fs/sendfile.c

diff --git a/fs/Makefile b/fs/Makefile
index 90c8852..1e3423f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -5,7 +5,7 @@
# Rewritten to use lists instead of if-statements.
#

-obj-y := open.o read_write.o file_table.o super.o \
+obj-y := open.o read_write.o sendfile.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
@@ -18,7 +18,6 @@ obj-y += buffer.o block_dev.o direct-io.o mpage.o
else
obj-y += no-block.o
endif
-
obj-$(CONFIG_PROC_FS) += proc_namespace.o

obj-y += notify/
diff --git a/fs/read_write.c b/fs/read_write.c
index 009d854..fc27a01 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1188,179 +1188,3 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
return __compat_sys_pwritev64(fd, vec, vlen, pos);
}
#endif
-
-static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
- size_t count, loff_t max)
-{
- struct fd in, out;
- struct inode *in_inode, *out_inode;
- loff_t pos;
- loff_t out_pos;
- ssize_t retval;
- int fl;
-
- /*
- * Get input file, and verify that it is ok..
- */
- retval = -EBADF;
- in = fdget(in_fd);
- if (!in.file)
- goto out;
- if (!(in.file->f_mode & FMODE_READ))
- goto fput_in;
- retval = -ESPIPE;
- if (!ppos) {
- pos = in.file->f_pos;
- } else {
- pos = *ppos;
- if (!(in.file->f_mode & FMODE_PREAD))
- goto fput_in;
- }
- retval = rw_verify_area(READ, in.file, &pos, count);
- if (retval < 0)
- goto fput_in;
- count = retval;
-
- /*
- * Get output file, and verify that it is ok..
- */
- retval = -EBADF;
- out = fdget(out_fd);
- if (!out.file)
- goto fput_in;
- if (!(out.file->f_mode & FMODE_WRITE))
- goto fput_out;
- retval = -EINVAL;
- in_inode = file_inode(in.file);
- out_inode = file_inode(out.file);
- out_pos = out.file->f_pos;
- retval = rw_verify_area(WRITE, out.file, &out_pos, count);
- if (retval < 0)
- goto fput_out;
- count = retval;
-
- if (!max)
- max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
-
- if (unlikely(pos + count > max)) {
- retval = -EOVERFLOW;
- if (pos >= max)
- goto fput_out;
- count = max - pos;
- }
-
- fl = 0;
-#if 0
- /*
- * We need to debate whether we can enable this or not. The
- * man page documents EAGAIN return for the output at least,
- * and the application is arguably buggy if it doesn't expect
- * EAGAIN on a non-blocking file descriptor.
- */
- if (in.file->f_flags & O_NONBLOCK)
- fl = SPLICE_F_NONBLOCK;
-#endif
- file_start_write(out.file);
- retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
- file_end_write(out.file);
-
- if (retval > 0) {
- add_rchar(current, retval);
- add_wchar(current, retval);
- fsnotify_access(in.file);
- fsnotify_modify(out.file);
- out.file->f_pos = out_pos;
- if (ppos)
- *ppos = pos;
- else
- in.file->f_pos = pos;
- }
-
- inc_syscr(current);
- inc_syscw(current);
- if (pos > max)
- retval = -EOVERFLOW;
-
-fput_out:
- fdput(out);
-fput_in:
- fdput(in);
-out:
- return retval;
-}
-
-SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
-{
- loff_t pos;
- off_t off;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(get_user(off, offset)))
- return -EFAULT;
- pos = off;
- ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-
-SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
-{
- loff_t pos;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
- return -EFAULT;
- ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
- compat_off_t __user *, offset, compat_size_t, count)
-{
- loff_t pos;
- off_t off;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(get_user(off, offset)))
- return -EFAULT;
- pos = off;
- ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-
-COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
- compat_loff_t __user *, offset, compat_size_t, count)
-{
- loff_t pos;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
- return -EFAULT;
- ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-#endif
diff --git a/fs/sendfile.c b/fs/sendfile.c
new file mode 100644
index 0000000..4ceccd4
--- /dev/null
+++ b/fs/sendfile.c
@@ -0,0 +1,200 @@
+/*
+ * linux/fs/sendfile.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/aio.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/splice.h>
+#include <linux/compat.h>
+#include "internal.h"
+
+#include <linux/uaccess.h>
+#include <asm/unistd.h>
+
+
+static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
+ size_t count, loff_t max)
+{
+ struct fd in, out;
+ struct inode *in_inode, *out_inode;
+ loff_t pos;
+ loff_t out_pos;
+ ssize_t retval;
+ int fl;
+
+ /*
+ * Get input file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ in = fdget(in_fd);
+ if (!in.file)
+ goto out;
+ if (!(in.file->f_mode & FMODE_READ))
+ goto fput_in;
+ retval = -ESPIPE;
+ if (!ppos) {
+ pos = in.file->f_pos;
+ } else {
+ pos = *ppos;
+ if (!(in.file->f_mode & FMODE_PREAD))
+ goto fput_in;
+ }
+ retval = rw_verify_area(READ, in.file, &pos, count);
+ if (retval < 0)
+ goto fput_in;
+ count = retval;
+
+ /*
+ * Get output file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ out = fdget(out_fd);
+ if (!out.file)
+ goto fput_in;
+ if (!(out.file->f_mode & FMODE_WRITE))
+ goto fput_out;
+ retval = -EINVAL;
+ in_inode = file_inode(in.file);
+ out_inode = file_inode(out.file);
+ out_pos = out.file->f_pos;
+ retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+ if (retval < 0)
+ goto fput_out;
+ count = retval;
+
+ if (!max)
+ max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
+
+ if (unlikely(pos + count > max)) {
+ retval = -EOVERFLOW;
+ if (pos >= max)
+ goto fput_out;
+ count = max - pos;
+ }
+
+ fl = 0;
+#if 0
+ /*
+ * We need to debate whether we can enable this or not. The
+ * man page documents EAGAIN return for the output at least,
+ * and the application is arguably buggy if it doesn't expect
+ * EAGAIN on a non-blocking file descriptor.
+ */
+ if (in.file->f_flags & O_NONBLOCK)
+ fl = SPLICE_F_NONBLOCK;
+#endif
+ file_start_write(out.file);
+ retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
+ file_end_write(out.file);
+
+ if (retval > 0) {
+ add_rchar(current, retval);
+ add_wchar(current, retval);
+ fsnotify_access(in.file);
+ fsnotify_modify(out.file);
+ out.file->f_pos = out_pos;
+ if (ppos)
+ *ppos = pos;
+ else
+ in.file->f_pos = pos;
+ }
+
+ inc_syscr(current);
+ inc_syscw(current);
+ if (pos > max)
+ retval = -EOVERFLOW;
+
+fput_out:
+ fdput(out);
+fput_in:
+ fdput(in);
+out:
+ return retval;
+}
+
+SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
+{
+ loff_t pos;
+ off_t off;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(get_user(off, offset)))
+ return -EFAULT;
+ pos = off;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
+{
+ loff_t pos;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+ return -EFAULT;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
+ compat_off_t __user *, offset, compat_size_t, count)
+{
+ loff_t pos;
+ off_t off;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(get_user(off, offset)))
+ return -EFAULT;
+ pos = off;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
+ compat_loff_t __user *, offset, compat_size_t, count)
+{
+ loff_t pos;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+ return -EFAULT;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+#endif
--
1.9.1


2014-10-20 21:49:48

by Pieter Smith

[permalink] [raw]
Subject: [PATCH 2/2] fs: Support compiling out sendfile

Many embedded systems will not need this syscall, and omitting it
saves space. Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
(default y) to support compiling it out.

bloat-o-meter:
add/remove: 0/4 grow/shrink: 5/0 up/down: 23/-751 (-728)
function old new delta
sys_pwritev 115 122 +7
sys_preadv 115 122 +7
fdput_pos 29 36 +7
sys_pwrite64 115 116 +1
sys_pread64 115 116 +1
fdput 11 - -11
sys_sendfile 122 - -122
sys_sendfile64 126 - -126
do_sendfile 492 - -492

Signed-off-by: Pieter Smith <[email protected]>
---
fs/Makefile | 3 ++-
init/Kconfig | 10 ++++++++++
kernel/sys_ni.c | 4 ++++
3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/Makefile b/fs/Makefile
index 1e3423f..1bbfea7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -5,7 +5,7 @@
# Rewritten to use lists instead of if-statements.
#

-obj-y := open.o read_write.o sendfile.o file_table.o super.o \
+obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
@@ -38,6 +38,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
+obj-$(CONFIG_SENDFILE_SYSCALL) += sendfile.o

obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
diff --git a/init/Kconfig b/init/Kconfig
index 782a65b..df6785c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1547,6 +1547,16 @@ config ADVISE_SYSCALLS
applications use these syscalls, you can disable this option to save
space.

+config SENDFILE_SYSCALL
+ bool "Enable sendfile syscall" if EXPERT
+ default y
+ help
+ This option enables the sendfile syscall, used by applications to copy
+ data between file descriptors. Because sendfile performs the copying
+ within the kernel, it is more efficient than the combination of read
+ and write. If building an embedded system where no applications use
+ the sendfile syscall, you can disable this option to save space.
+
config PCI_QUIRKS
default y
bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d4709d4..b068de7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -159,6 +159,10 @@ cond_syscall(sys_uselib);
cond_syscall(sys_fadvise64);
cond_syscall(sys_fadvise64_64);
cond_syscall(sys_madvise);
+cond_syscall(sys_sendfile);
+cond_syscall(sys_sendfile64);
+cond_syscall(compat_sys_sendfile);
+cond_syscall(compat_sys_sendfile64);

/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
1.9.1

2014-10-20 22:04:02

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH 1/2] fs: Moved sendfile syscall to own source file

On Mon, Oct 20, 2014 at 11:48:36PM +0200, Pieter Smith wrote:
> Part of the tinification effort. Splitting out the sendfile syscall
> allows optional compilation in the succeeding patch.
>
> Signed-off-by: Pieter Smith <[email protected]>

Not sure why this was sent twice, but in any case, the same comment
applies: one nit below, and with that fixed:
Reviewed-by: Josh Triplett <[email protected]>

I've explicitly checked that the moved code matches between
fs/read_write.c and fs/sendfile.c, modulo a single whitespace fix (in
the indentation of the continuation line for do_sendfile's definition).

> fs/Makefile | 3 +-
> fs/read_write.c | 176 -------------------------------------------------
> fs/sendfile.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 201 insertions(+), 178 deletions(-)
> create mode 100644 fs/sendfile.c
>
> diff --git a/fs/Makefile b/fs/Makefile
> index 90c8852..1e3423f 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -5,7 +5,7 @@
> # Rewritten to use lists instead of if-statements.
> #
>
> -obj-y := open.o read_write.o file_table.o super.o \
> +obj-y := open.o read_write.o sendfile.o file_table.o super.o \
> char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
> ioctl.o readdir.o select.o dcache.o inode.o \
> attr.o bad_inode.o file.o filesystems.o namespace.o \
> @@ -18,7 +18,6 @@ obj-y += buffer.o block_dev.o direct-io.o mpage.o
> else
> obj-y += no-block.o
> endif
> -

Please drop the unrelated whitespace change.

> obj-$(CONFIG_PROC_FS) += proc_namespace.o
>
> obj-y += notify/
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 009d854..fc27a01 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1188,179 +1188,3 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
> return __compat_sys_pwritev64(fd, vec, vlen, pos);
> }
> #endif
> -
> -static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> - size_t count, loff_t max)
> -{
> - struct fd in, out;
> - struct inode *in_inode, *out_inode;
> - loff_t pos;
> - loff_t out_pos;
> - ssize_t retval;
> - int fl;
> -
> - /*
> - * Get input file, and verify that it is ok..
> - */
> - retval = -EBADF;
> - in = fdget(in_fd);
> - if (!in.file)
> - goto out;
> - if (!(in.file->f_mode & FMODE_READ))
> - goto fput_in;
> - retval = -ESPIPE;
> - if (!ppos) {
> - pos = in.file->f_pos;
> - } else {
> - pos = *ppos;
> - if (!(in.file->f_mode & FMODE_PREAD))
> - goto fput_in;
> - }
> - retval = rw_verify_area(READ, in.file, &pos, count);
> - if (retval < 0)
> - goto fput_in;
> - count = retval;
> -
> - /*
> - * Get output file, and verify that it is ok..
> - */
> - retval = -EBADF;
> - out = fdget(out_fd);
> - if (!out.file)
> - goto fput_in;
> - if (!(out.file->f_mode & FMODE_WRITE))
> - goto fput_out;
> - retval = -EINVAL;
> - in_inode = file_inode(in.file);
> - out_inode = file_inode(out.file);
> - out_pos = out.file->f_pos;
> - retval = rw_verify_area(WRITE, out.file, &out_pos, count);
> - if (retval < 0)
> - goto fput_out;
> - count = retval;
> -
> - if (!max)
> - max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
> -
> - if (unlikely(pos + count > max)) {
> - retval = -EOVERFLOW;
> - if (pos >= max)
> - goto fput_out;
> - count = max - pos;
> - }
> -
> - fl = 0;
> -#if 0
> - /*
> - * We need to debate whether we can enable this or not. The
> - * man page documents EAGAIN return for the output at least,
> - * and the application is arguably buggy if it doesn't expect
> - * EAGAIN on a non-blocking file descriptor.
> - */
> - if (in.file->f_flags & O_NONBLOCK)
> - fl = SPLICE_F_NONBLOCK;
> -#endif
> - file_start_write(out.file);
> - retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
> - file_end_write(out.file);
> -
> - if (retval > 0) {
> - add_rchar(current, retval);
> - add_wchar(current, retval);
> - fsnotify_access(in.file);
> - fsnotify_modify(out.file);
> - out.file->f_pos = out_pos;
> - if (ppos)
> - *ppos = pos;
> - else
> - in.file->f_pos = pos;
> - }
> -
> - inc_syscr(current);
> - inc_syscw(current);
> - if (pos > max)
> - retval = -EOVERFLOW;
> -
> -fput_out:
> - fdput(out);
> -fput_in:
> - fdput(in);
> -out:
> - return retval;
> -}
> -
> -SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
> -{
> - loff_t pos;
> - off_t off;
> - ssize_t ret;
> -
> - if (offset) {
> - if (unlikely(get_user(off, offset)))
> - return -EFAULT;
> - pos = off;
> - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
> - if (unlikely(put_user(pos, offset)))
> - return -EFAULT;
> - return ret;
> - }
> -
> - return do_sendfile(out_fd, in_fd, NULL, count, 0);
> -}
> -
> -SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
> -{
> - loff_t pos;
> - ssize_t ret;
> -
> - if (offset) {
> - if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
> - return -EFAULT;
> - ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
> - if (unlikely(put_user(pos, offset)))
> - return -EFAULT;
> - return ret;
> - }
> -
> - return do_sendfile(out_fd, in_fd, NULL, count, 0);
> -}
> -
> -#ifdef CONFIG_COMPAT
> -COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
> - compat_off_t __user *, offset, compat_size_t, count)
> -{
> - loff_t pos;
> - off_t off;
> - ssize_t ret;
> -
> - if (offset) {
> - if (unlikely(get_user(off, offset)))
> - return -EFAULT;
> - pos = off;
> - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
> - if (unlikely(put_user(pos, offset)))
> - return -EFAULT;
> - return ret;
> - }
> -
> - return do_sendfile(out_fd, in_fd, NULL, count, 0);
> -}
> -
> -COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
> - compat_loff_t __user *, offset, compat_size_t, count)
> -{
> - loff_t pos;
> - ssize_t ret;
> -
> - if (offset) {
> - if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
> - return -EFAULT;
> - ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
> - if (unlikely(put_user(pos, offset)))
> - return -EFAULT;
> - return ret;
> - }
> -
> - return do_sendfile(out_fd, in_fd, NULL, count, 0);
> -}
> -#endif
> diff --git a/fs/sendfile.c b/fs/sendfile.c
> new file mode 100644
> index 0000000..4ceccd4
> --- /dev/null
> +++ b/fs/sendfile.c
> @@ -0,0 +1,200 @@
> +/*
> + * linux/fs/sendfile.c
> + *
> + * Copyright (C) 1991, 1992 Linus Torvalds
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/stat.h>
> +#include <linux/fcntl.h>
> +#include <linux/file.h>
> +#include <linux/uio.h>
> +#include <linux/aio.h>
> +#include <linux/fsnotify.h>
> +#include <linux/security.h>
> +#include <linux/export.h>
> +#include <linux/syscalls.h>
> +#include <linux/pagemap.h>
> +#include <linux/splice.h>
> +#include <linux/compat.h>
> +#include "internal.h"
> +
> +#include <linux/uaccess.h>
> +#include <asm/unistd.h>
> +
> +
> +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> + size_t count, loff_t max)
> +{
> + struct fd in, out;
> + struct inode *in_inode, *out_inode;
> + loff_t pos;
> + loff_t out_pos;
> + ssize_t retval;
> + int fl;
> +
> + /*
> + * Get input file, and verify that it is ok..
> + */
> + retval = -EBADF;
> + in = fdget(in_fd);
> + if (!in.file)
> + goto out;
> + if (!(in.file->f_mode & FMODE_READ))
> + goto fput_in;
> + retval = -ESPIPE;
> + if (!ppos) {
> + pos = in.file->f_pos;
> + } else {
> + pos = *ppos;
> + if (!(in.file->f_mode & FMODE_PREAD))
> + goto fput_in;
> + }
> + retval = rw_verify_area(READ, in.file, &pos, count);
> + if (retval < 0)
> + goto fput_in;
> + count = retval;
> +
> + /*
> + * Get output file, and verify that it is ok..
> + */
> + retval = -EBADF;
> + out = fdget(out_fd);
> + if (!out.file)
> + goto fput_in;
> + if (!(out.file->f_mode & FMODE_WRITE))
> + goto fput_out;
> + retval = -EINVAL;
> + in_inode = file_inode(in.file);
> + out_inode = file_inode(out.file);
> + out_pos = out.file->f_pos;
> + retval = rw_verify_area(WRITE, out.file, &out_pos, count);
> + if (retval < 0)
> + goto fput_out;
> + count = retval;
> +
> + if (!max)
> + max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
> +
> + if (unlikely(pos + count > max)) {
> + retval = -EOVERFLOW;
> + if (pos >= max)
> + goto fput_out;
> + count = max - pos;
> + }
> +
> + fl = 0;
> +#if 0
> + /*
> + * We need to debate whether we can enable this or not. The
> + * man page documents EAGAIN return for the output at least,
> + * and the application is arguably buggy if it doesn't expect
> + * EAGAIN on a non-blocking file descriptor.
> + */
> + if (in.file->f_flags & O_NONBLOCK)
> + fl = SPLICE_F_NONBLOCK;
> +#endif
> + file_start_write(out.file);
> + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
> + file_end_write(out.file);
> +
> + if (retval > 0) {
> + add_rchar(current, retval);
> + add_wchar(current, retval);
> + fsnotify_access(in.file);
> + fsnotify_modify(out.file);
> + out.file->f_pos = out_pos;
> + if (ppos)
> + *ppos = pos;
> + else
> + in.file->f_pos = pos;
> + }
> +
> + inc_syscr(current);
> + inc_syscw(current);
> + if (pos > max)
> + retval = -EOVERFLOW;
> +
> +fput_out:
> + fdput(out);
> +fput_in:
> + fdput(in);
> +out:
> + return retval;
> +}
> +
> +SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
> +{
> + loff_t pos;
> + off_t off;
> + ssize_t ret;
> +
> + if (offset) {
> + if (unlikely(get_user(off, offset)))
> + return -EFAULT;
> + pos = off;
> + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
> + if (unlikely(put_user(pos, offset)))
> + return -EFAULT;
> + return ret;
> + }
> +
> + return do_sendfile(out_fd, in_fd, NULL, count, 0);
> +}
> +
> +SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
> +{
> + loff_t pos;
> + ssize_t ret;
> +
> + if (offset) {
> + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
> + return -EFAULT;
> + ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
> + if (unlikely(put_user(pos, offset)))
> + return -EFAULT;
> + return ret;
> + }
> +
> + return do_sendfile(out_fd, in_fd, NULL, count, 0);
> +}
> +
> +#ifdef CONFIG_COMPAT
> +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
> + compat_off_t __user *, offset, compat_size_t, count)
> +{
> + loff_t pos;
> + off_t off;
> + ssize_t ret;
> +
> + if (offset) {
> + if (unlikely(get_user(off, offset)))
> + return -EFAULT;
> + pos = off;
> + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
> + if (unlikely(put_user(pos, offset)))
> + return -EFAULT;
> + return ret;
> + }
> +
> + return do_sendfile(out_fd, in_fd, NULL, count, 0);
> +}
> +
> +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
> + compat_loff_t __user *, offset, compat_size_t, count)
> +{
> + loff_t pos;
> + ssize_t ret;
> +
> + if (offset) {
> + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
> + return -EFAULT;
> + ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
> + if (unlikely(put_user(pos, offset)))
> + return -EFAULT;
> + return ret;
> + }
> +
> + return do_sendfile(out_fd, in_fd, NULL, count, 0);
> +}
> +#endif
> --
> 1.9.1
>

2014-10-20 22:24:33

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Mon, Oct 20, 2014 at 11:48:37PM +0200, Pieter Smith wrote:
> Many embedded systems will not need this syscall, and omitting it
> saves space. Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> (default y) to support compiling it out.

Nice work, thanks!

If there are no objections, and nobody has a tree they'd rather carry
this through, I'll take the series through the tiny tree when it's ready
to merge.

> bloat-o-meter:
> add/remove: 0/4 grow/shrink: 5/0 up/down: 23/-751 (-728)
> function old new delta
> sys_pwritev 115 122 +7
> sys_preadv 115 122 +7
> fdput_pos 29 36 +7
> sys_pwrite64 115 116 +1
> sys_pread64 115 116 +1
> fdput 11 - -11
> sys_sendfile 122 - -122
> sys_sendfile64 126 - -126
> do_sendfile 492 - -492

Interesting inlining decisions by GCC here. Got a bloat-o-meter for the
two-patch series, by any chance? (Also, is this with tinyconfig? In
particular, with OPTIMIZE_INLINING and OPTIMIZE_FOR_SIZE?) I'm
wondering if moving sendfile to a separate file made GCC put fdput
out-of-line, and compiling it out reversed that again.

> Signed-off-by: Pieter Smith <[email protected]>

Reviewed-by: Josh Triplett <[email protected]>

> ---
> fs/Makefile | 3 ++-
> init/Kconfig | 10 ++++++++++
> kernel/sys_ni.c | 4 ++++
> 3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/fs/Makefile b/fs/Makefile
> index 1e3423f..1bbfea7 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -5,7 +5,7 @@
> # Rewritten to use lists instead of if-statements.
> #
>
> -obj-y := open.o read_write.o sendfile.o file_table.o super.o \
> +obj-y := open.o read_write.o file_table.o super.o \
> char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
> ioctl.o readdir.o select.o dcache.o inode.o \
> attr.o bad_inode.o file.o filesystems.o namespace.o \
> @@ -38,6 +38,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
> obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
> obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
> obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
> +obj-$(CONFIG_SENDFILE_SYSCALL) += sendfile.o
>
> obj-$(CONFIG_FS_MBCACHE) += mbcache.o
> obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
> diff --git a/init/Kconfig b/init/Kconfig
> index 782a65b..df6785c 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1547,6 +1547,16 @@ config ADVISE_SYSCALLS
> applications use these syscalls, you can disable this option to save
> space.
>
> +config SENDFILE_SYSCALL
> + bool "Enable sendfile syscall" if EXPERT
> + default y
> + help
> + This option enables the sendfile syscall, used by applications to copy
> + data between file descriptors. Because sendfile performs the copying
> + within the kernel, it is more efficient than the combination of read
> + and write. If building an embedded system where no applications use
> + the sendfile syscall, you can disable this option to save space.
> +

I'm thinking of adding a submenu to group config FOO_SYSCALL options. :)
I'll probably push that as part of the 3.19 merge window, as a patch on
top of all of the individual tinification options.

> config PCI_QUIRKS
> default y
> bool "Enable PCI quirk workarounds" if EXPERT
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index d4709d4..b068de7 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -159,6 +159,10 @@ cond_syscall(sys_uselib);
> cond_syscall(sys_fadvise64);
> cond_syscall(sys_fadvise64_64);
> cond_syscall(sys_madvise);
> +cond_syscall(sys_sendfile);
> +cond_syscall(sys_sendfile64);
> +cond_syscall(compat_sys_sendfile);
> +cond_syscall(compat_sys_sendfile64);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);
> --
> 1.9.1
>

2014-10-21 07:52:34

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Mon, Oct 20, 2014 at 03:24:22PM -0700, Josh Triplett wrote:
> On Mon, Oct 20, 2014 at 11:48:37PM +0200, Pieter Smith wrote:
> > Many embedded systems will not need this syscall, and omitting it
> > saves space. Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> > (default y) to support compiling it out.
>
> Nice work, thanks!
>
> If there are no objections, and nobody has a tree they'd rather carry
> this through, I'll take the series through the tiny tree when it's ready
> to merge.

I think it's rather pointless - there is very little sendfile code,
so you'd rather want to disable splice.

2014-10-21 09:04:49

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Tue, Oct 21, 2014 at 12:51:54AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 20, 2014 at 03:24:22PM -0700, Josh Triplett wrote:
> > On Mon, Oct 20, 2014 at 11:48:37PM +0200, Pieter Smith wrote:
> > > Many embedded systems will not need this syscall, and omitting it
> > > saves space. Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> > > (default y) to support compiling it out.
> >
> > Nice work, thanks!
> >
> > If there are no objections, and nobody has a tree they'd rather carry
> > this through, I'll take the series through the tiny tree when it's ready
> > to merge.
>
> I think it's rather pointless - there is very little sendfile code,
> so you'd rather want to disable splice.

That's the plan, but since sendfile depends on some of the splice bits,
sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
SPLICE_SYSCALLS.

- Josh Triplett

2014-10-21 09:14:40

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Tue, Oct 21, 2014 at 02:04:22AM -0700, Josh Triplett wrote:
> That's the plan, but since sendfile depends on some of the splice bits,
> sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
> SPLICE_SYSCALLS.

Just include sendfile with the splice syscalls - we don't really need a
config option for every obscure syscall.

2014-10-21 09:51:07

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Tue, Oct 21, 2014 at 02:13:56AM -0700, Christoph Hellwig wrote:
> On Tue, Oct 21, 2014 at 02:04:22AM -0700, Josh Triplett wrote:
> > That's the plan, but since sendfile depends on some of the splice bits,
> > sendfile needs to be optional as well; SENDFILE_SYSCALL will then select
> > SPLICE_SYSCALLS.
>
> Just include sendfile with the splice syscalls - we don't really need a
> config option for every obscure syscall.

No objection here. Pieter, since you're planning to remove splice
anyway, can you just fold the two together under the same Kconfig
option? That should simplify the patch series, since you won't need to
split the two.

- Josh Triplett

2014-10-21 18:22:47

by Eric Paris

[permalink] [raw]
Subject: Re: [PATCH 2/2] fs: Support compiling out sendfile

On Tue, 2014-10-21 at 10:18 -0700, [email protected] wrote:
> On Tue, Oct 21, 2014 at 08:37:00AM -0700, H. Peter Anvin wrote:
> > On 10/20/2014 02:48 PM, Pieter Smith wrote:
> > > Many embedded systems will not need this syscall, and omitting it
> > > saves space. Add a new EXPERT config option CONFIG_SENDFILE_SYSCALL
> > > (default y) to support compiling it out.
> >
> > <bikeshed>
> > I believe these options ought to be CONFIG_SYSCALL_*
> > </bikeshed>
>
> I agree. I think people started using CONFIG_*_SYSCALL because of
> things like AUDITSYSCALL

AUDITSYSCALL audits syscalls. It doesn't actually implement any
syscalls. You are right about SYSFS_SYSCALL though...