2008-12-12 14:01:50

by Gerd Hoffmann

[permalink] [raw]
Subject: [PATCH v2] Add preadv and pwritev system calls.

This patch adds preadv and pwritev system calls. These syscalls are a
pretty straightforward combination of pread and readv (same for write).
They are quite useful for doing vectored I/O in threaded applications.
Using lseek+readv instead opens race windows you'll have to plug with
locking.

Other systems have such system calls too, for example NetBSD, check
here: http://www.daemon-systems.org/man/preadv.2.html

The patch sports the actual system call implementation and the windup in
the x86 system call tables. Other archs are TBD.

Signed-off-by: Gerd Hoffmann <[email protected]>
---
arch/x86/ia32/ia32entry.S | 2 +
arch/x86/include/asm/unistd_32.h | 2 +
arch/x86/include/asm/unistd_64.h | 4 ++
arch/x86/kernel/syscall_table_32.S | 2 +
fs/compat.c | 61 ++++++++++++++++++++++++++++++++++++
fs/read_write.c | 48 ++++++++++++++++++++++++++++
6 files changed, 119 insertions(+), 0 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b..9a8501b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -826,4 +826,6 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad compat_sys_preadv
+ .quad compat_sys_pwritev
ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78..6e72d74 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,8 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_preadv 333
+#define __NR_pwritev 334

#ifdef __KERNEL__

diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e..f818294 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_preadv 295
+__SYSCALL(__NR_preadv, sys_preadv)
+#define __NR_pwritev 296
+__SYSCALL(__NR_pwritev, sys_pwritev)


#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395f..a1a5506 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,5 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_preadv
+ .long sys_pwritev
diff --git a/fs/compat.c b/fs/compat.c
index e5f49f5..3a25cf3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1214,6 +1214,67 @@ out:
return ret;
}

+asmlinkage ssize_t
+compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
+{
+ struct file *file;
+ ssize_t ret = -EBADF;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (!(file->f_mode & FMODE_READ))
+ goto out;
+
+ ret = -EINVAL;
+ if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+ goto out;
+
+ ret = compat_do_readv_writev(READ, file, vec, vlen, &pos);
+
+out:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ fput(file);
+ return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
+{
+ struct file *file;
+ ssize_t ret = -EBADF;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+ if (!(file->f_mode & FMODE_WRITE))
+ goto out;
+
+ ret = -EINVAL;
+ if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+ goto out;
+
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, &pos);
+
+out:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ fput(file);
+ return ret;
+}
+
asmlinkage long
compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
unsigned int nr_segs, unsigned int flags)
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9..89f273d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -701,6 +701,54 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
return ret;
}

+asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos)
+{
+ struct file *file;
+ ssize_t ret = -EBADF;
+ int fput_needed;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget_light(fd, &fput_needed);
+ if (file) {
+ ret = -ESPIPE;
+ if (file->f_mode & FMODE_PREAD)
+ ret = vfs_readv(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
+ }
+
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos)
+{
+ struct file *file;
+ ssize_t ret = -EBADF;
+ int fput_needed;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget_light(fd, &fput_needed);
+ if (file) {
+ ret = -ESPIPE;
+ if (file->f_mode & FMODE_PWRITE)
+ ret = vfs_writev(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
+ }
+
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
--
1.5.6.5


2008-12-12 15:29:42

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:
> The patch sports the actual system call implementation and the windup in
> the x86 system call tables. Other archs are TBD.

> +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)
> +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)

Are these prototypes required? MIPS and PARISC will need wrappers to
fix them if they are. These two architectures have an ABI which
requires 64-bit arguments to be passed in aligned pairs of registers,
but glibc doesn't know that (and given the existence of syscall(3),
can't do much about it even if it knew), so some of the arguments end up
in the wrong registers.

Things will go much better if we can prototype these as:

asmlinkage ssize_t sys_preadv(unsigned int fd, const struct iovec __user *vec,
loff_t pos, unsigned long vlen);
asmlinkage ssize_t sys_pwritev(unsigned int fd, const struct iovec __user *vec,
loff_t pos, unsigned long vlen);

That way 'pos' ends up split between arg2 and arg3 and vlen ends up in
arg4 instead of having vlen in arg2 and pos in arg3 and arg4 which then
have to be munged to be in arg4 and arg5 by a compat wrapper.

I seem to recall the s390 folks having some concerns with this kind of
thing too, but I forget what they are, so I'll let them weigh in on
this.

By the way, why did you make 'fd' an unsigned long? The rest of the
kernel uses unsigned int.

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-12-12 15:43:12

by Ralf Baechle

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:

> This patch adds preadv and pwritev system calls. These syscalls are a
> pretty straightforward combination of pread and readv (same for write).
> They are quite useful for doing vectored I/O in threaded applications.
> Using lseek+readv instead opens race windows you'll have to plug with
> locking.
>
> Other systems have such system calls too, for example NetBSD, check
> here: http://www.daemon-systems.org/man/preadv.2.html
>
> The patch sports the actual system call implementation and the windup in
> the x86 system call tables. Other archs are TBD.

> +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)

> +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)

As so often before the devil is in the function prototype. On some
architectures - 32-bit MIPS and PARISC, maybe others - 64-bit arguments
such as loff_t need to be passed in an _aligned_ pair of 32-bit
arguments which effectivly requires another wrapper like this around
your compat wrapper:

asmlinkage int sys32_preadv(unsigned long fd,
const struct compat_iovec __user *vec,
unsigned long vlen, int dummy, unsigned a5, unsigned a6)
{
return compat_sys_preadv(fd, vec, vlen, merge_64(a5, a6));
}

merge_64() takes two 32-bit halves of a 64-bit argument and combines them
into a 64-bit argument again.

I wonder, does that merging happen magically on x86 or?

Ralf

2008-12-12 15:49:05

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Matthew Wilcox wrote:
>> +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
>> + unsigned long vlen, loff_t pos)
>> +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
>> + unsigned long vlen, loff_t pos)
>
> Are these prototypes required? MIPS and PARISC will need wrappers to
> fix them if they are. These two architectures have an ABI which
> requires 64-bit arguments to be passed in aligned pairs of registers,
> but glibc doesn't know that (and given the existence of syscall(3),
> can't do much about it even if it knew), so some of the arguments end up
> in the wrong registers.
>
> Things will go much better if we can prototype these as:
>
> asmlinkage ssize_t sys_preadv(unsigned int fd,
> const struct iovec __user *vec,
> loff_t pos, unsigned long vlen);
> asmlinkage ssize_t sys_pwritev(unsigned int fd,
> const struct iovec __user *vec,
> loff_t pos, unsigned long vlen);

Hmm. It is the argument ordering used by NetBSD, thats why I used that
too. It certainly should be the application-visible ordering. We'll
have glibc between apps and kernel though, so I think we can reorder the
arguments at syscall level if that helps on these archs. Cc'ing Ulrich
Drepper for comments on that.

> By the way, why did you make 'fd' an unsigned long? The rest of the
> kernel uses unsigned int.

sys_{readv,writev} have unsigned long too, this is where I got it from.
Don't know what the reason for this is, it looks a bit odd indeed.

cheers,
Gerd

2008-12-12 15:51:29

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 04:48:36PM +0100, Gerd Hoffmann wrote:
> Hmm. It is the argument ordering used by NetBSD, thats why I used that
> too. It certainly should be the application-visible ordering. We'll
> have glibc between apps and kernel though, so I think we can reorder the
> arguments at syscall level if that helps on these archs. Cc'ing Ulrich
> Drepper for comments on that.

On the other hand, NetBSD have approximately 0% market share.
We shouldn't let them lock us into making a bad decision. Is there
anyone other than NetBSD who has added these syscalls?

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-12-12 16:02:30

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Matthew Wilcox wrote:
> On the other hand, NetBSD have approximately 0% market share.
> We shouldn't let them lock us into making a bad decision. Is there
> anyone other than NetBSD who has added these syscalls?

Free- and OpenBSD have it too. For Solaris I've found a feature request
only. Dunno about MacOS/Darwin. Other un*xes which are important these
days?

I'd *really* hate it to have the same system call with different
argument ordering on different systems though. Especially when swapping
two integer values, so gcc wouldn't error out on wrong usage.

cheers,
Gerd

2008-12-12 17:00:29

by Russell King

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 969a6d9..89f273d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -701,6 +701,54 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
> return ret;
> }
>
> +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)

For ARM, I'd prefer this to be:

unsigned long fd, const struct iovec __user *vec,
loff_t pos, unsigned long vlen

to avoid any variance in ABIness which will occur with the argument
layout as it currently stands, but that creates a silly argument
order. The other option would be for us to define our own version
in an ARM ABI independent manner:

ssize_t sys_arm_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, unsigned long low_pos, unsigned long high_pos)

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2008-12-12 17:03:58

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 05:02:05PM +0100, Gerd Hoffmann wrote:
> I'd *really* hate it to have the same system call with different
> argument ordering on different systems though. Especially when swapping
> two integer values, so gcc wouldn't error out on wrong usage.

We can always permute it further:

int fd, int vlen, loff_t pos, const struct *

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-12-12 18:22:20

by Alan

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, 12 Dec 2008 10:03:47 -0700
Matthew Wilcox <[email protected]> wrote:

> On Fri, Dec 12, 2008 at 05:02:05PM +0100, Gerd Hoffmann wrote:
> > I'd *really* hate it to have the same system call with different
> > argument ordering on different systems though. Especially when swapping
> > two integer values, so gcc wouldn't error out on wrong usage.
>
> We can always permute it further:
>
> int fd, int vlen, loff_t pos, const struct *

Or you could add cobol calling syntax or pass the arguments in XML
format ?

Any particular reason you want to make things hell for programmers and
the standard people. Follow the BSD one at least to user space. Anything
else will just lead to pain and suffering later on the standardisation
front.

Alan

2008-12-12 18:44:49

by Scott Lurndal

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 05:02:05PM +0100, Gerd Hoffmann wrote:
> Matthew Wilcox wrote:
> > On the other hand, NetBSD have approximately 0% market share.
> > We shouldn't let them lock us into making a bad decision. Is there
> > anyone other than NetBSD who has added these syscalls?
>
> Free- and OpenBSD have it too. For Solaris I've found a feature request
> only. Dunno about MacOS/Darwin. Other un*xes which are important these
> days?
>
> I'd *really* hate it to have the same system call with different
> argument ordering on different systems though. Especially when swapping
> two integer values, so gcc wouldn't error out on wrong usage.

I would suggest that from the end-users perspective, the user-mode API
should be similar to pread/pwrite, e.g:

int preadv(fd, iovec, iovec_size, offset)

scott

2008-12-12 19:11:49

by Russell King

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 06:21:42PM +0000, Alan Cox wrote:
> On Fri, 12 Dec 2008 10:03:47 -0700
> Matthew Wilcox <[email protected]> wrote:
>
> > On Fri, Dec 12, 2008 at 05:02:05PM +0100, Gerd Hoffmann wrote:
> > > I'd *really* hate it to have the same system call with different
> > > argument ordering on different systems though. Especially when swapping
> > > two integer values, so gcc wouldn't error out on wrong usage.
> >
> > We can always permute it further:
> >
> > int fd, int vlen, loff_t pos, const struct *
>
> Or you could add cobol calling syntax or pass the arguments in XML
> format ?
>
> Any particular reason you want to make things hell for programmers and
> the standard people. Follow the BSD one at least to user space. Anything
> else will just lead to pain and suffering later on the standardisation
> front.

Yes - non-aligned 64-bit arguments in registers really really really
sucks. On ARM, we will change the syscall argument ordering to avoid
having additional compatibility implementations for EABI vs OABI.

The same goes for similar ABIs with restrictions on 64-bit arguments.

There are causes were a syscall just can not be used on such ABIs. Eg,

int a1, unsigned long long a2, int a3, unsigned long long a4

is an impossible syscall argument order for ARM - we run out of registers
for the upper word of 'a4'.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2008-12-12 19:13:19

by Russell King

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 10:29:29AM -0800, Scott Lurndal wrote:
> On Fri, Dec 12, 2008 at 05:02:05PM +0100, Gerd Hoffmann wrote:
> > Matthew Wilcox wrote:
> > > On the other hand, NetBSD have approximately 0% market share.
> > > We shouldn't let them lock us into making a bad decision. Is there
> > > anyone other than NetBSD who has added these syscalls?
> >
> > Free- and OpenBSD have it too. For Solaris I've found a feature request
> > only. Dunno about MacOS/Darwin. Other un*xes which are important these
> > days?
> >
> > I'd *really* hate it to have the same system call with different
> > argument ordering on different systems though. Especially when swapping
> > two integer values, so gcc wouldn't error out on wrong usage.
>
> I would suggest that from the end-users perspective, the user-mode API
> should be similar to pread/pwrite, e.g:
>
> int preadv(fd, iovec, iovec_size, offset)

Yes, and that's easy for glibc to achieve.

What's hard is that the user <-> kernel API firstly has a limited number
of registers available to it for passing arguments without indirection
from user space into kernel space.

Secondly, the user <-> kernel argument register allocation can vary
depending on the ABI version which user space or kernel space is built
for. On ARM we have two ABIs, one where 64-bit arguments can be placed
in any two consecutive registers, and one where 64-bit arguments must
be placed in an even,odd register pair (not an odd,even pair.)

That leads to the above being:

fd r0 r0
iovec r1 r1
vecsz r2 r2
offset r3,r4 r4,r5

Notice the different register allocation for the 64-bit offset.

This problem of register-aligned argument placement is not limited
to just ARM, but several other Linux supported architectures.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2008-12-12 19:49:20

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Friday 12 December 2008, Matthew Wilcox wrote:
> Things will go much better if we can prototype these as:
>
> asmlinkage ssize_t sys_preadv(unsigned int fd, const struct iovec __user *vec,
> ????????????????????????????????loff_t pos, unsigned long vlen);
> asmlinkage ssize_t sys_pwritev(unsigned int fd, const struct iovec __user *vec,
> ????????????????????????????????loff_t pos, unsigned long vlen);

I would vote for doing it the same way as sys_llseek, which avoids
this issue entirely by passing the upper half of pos sepearately:


asmlinkage ssize_t sys_preadv(unsigned long fd,
const struct iovec __user *vec,
unsigned long vlen,
unsigned long pos_high, unsigned long pos_low);
asmlinkage ssize_t sys_pwritev(unsigned long fd,
const struct iovec __user *vec,
unsigned long vlen,
unsigned long pos_high, unsigned long pos_low);

This is the only way I can see that lets us use a shared
compat_sys_preadv/pwritev across all 64 bit architectures.

The libc can then add a trivial wrapper around the syscalls
to get the regular calling conventions.

Aside from that, have you considered doing something even more flexible,
like this?

struct piovec {
void __user *iov_base;
__kernel_size_t iov_len;
__kernel_loff_t pos;
};

asmlinkage ssize_t sys_pwritev(unsigned long fd,
const struct piovec __user *vec,
unsigned long vlen);

Arnd <><

2008-12-12 19:56:21

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Russell King wrote:
>> should be similar to pread/pwrite, e.g:
>>
>> int preadv(fd, iovec, iovec_size, offset)
>
> Yes, and that's easy for glibc to achieve.

This hints the ABI problem exists at syscall level only. Is that
correct? So we can have

preadv(fd, vec, vlen, off)

argument ordering at app <-> glibc level and

preadv(fd, vec, off, vlen)

ordering at glibc <-> kernel (aka syscall) level and it works fine for
ARM + MIPS + PARISC?

thanks,
Gerd

2008-12-12 20:02:56

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Arnd Bergmann wrote:
> Aside from that, have you considered doing something even more flexible,
> like this?
>
> struct piovec {
> void __user *iov_base;
> __kernel_size_t iov_len;
> __kernel_loff_t pos;
> };
>
> asmlinkage ssize_t sys_pwritev(unsigned long fd,
> const struct piovec __user *vec,
> unsigned long vlen);

There is little point in doing so because I *really* want the
user-visible API being identical to the existing ones in the *BSD
family. Anything else is just a PITA for the applications using this.

cheers,
Gerd

2008-12-12 20:13:25

by Russell King

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 08:56:00PM +0100, Gerd Hoffmann wrote:
> Russell King wrote:
> >> should be similar to pread/pwrite, e.g:
> >>
> >> int preadv(fd, iovec, iovec_size, offset)
> >
> > Yes, and that's easy for glibc to achieve.
>
> This hints the ABI problem exists at syscall level only. Is that
> correct? So we can have
>
> preadv(fd, vec, vlen, off)
>
> argument ordering at app <-> glibc level and
>
> preadv(fd, vec, off, vlen)
>
> ordering at glibc <-> kernel (aka syscall) level and it works fine for
> ARM + MIPS + PARISC?

Fine for ARM - and yes, the user visible API should be changed from the
BSD standard. I don't think anyone in this thread was suggesting that
the user visible argument ordering should be any different from the
original.

Having it in a different order from *BSD at the libc visible interface
is just crazy from the OS portability point of view.

--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of:

2008-12-12 20:39:34

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Russell King wrote:
> On Fri, Dec 12, 2008 at 08:56:00PM +0100, Gerd Hoffmann wrote:
>> Russell King wrote:
>>>> should be similar to pread/pwrite, e.g:
>>>>
>>>> int preadv(fd, iovec, iovec_size, offset)
>>> Yes, and that's easy for glibc to achieve.
>> This hints the ABI problem exists at syscall level only. Is that
>> correct? So we can have
>>
>> preadv(fd, vec, vlen, off)
>>
>> argument ordering at app <-> glibc level and
>>
>> preadv(fd, vec, off, vlen)
>>
>> ordering at glibc <-> kernel (aka syscall) level and it works fine for
>> ARM + MIPS + PARISC?
>
> Fine for ARM

Great. I'll happily switch the ordering then. /me goes wait for acks
from the other archs and plans for a new patch revision early next week.

thanks,
Gerd

2008-12-13 01:18:26

by Michael Kerrisk

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Gerd,

Please CC linux-api on patches that change the API!

Cheers,

Michael

On Fri, Dec 12, 2008 at 9:00 AM, Gerd Hoffmann <[email protected]> wrote:
> This patch adds preadv and pwritev system calls. These syscalls are a
> pretty straightforward combination of pread and readv (same for write).
> They are quite useful for doing vectored I/O in threaded applications.
> Using lseek+readv instead opens race windows you'll have to plug with
> locking.
>
> Other systems have such system calls too, for example NetBSD, check
> here: http://www.daemon-systems.org/man/preadv.2.html
>
> The patch sports the actual system call implementation and the windup in
> the x86 system call tables. Other archs are TBD.
>
> Signed-off-by: Gerd Hoffmann <[email protected]>
> ---
> arch/x86/ia32/ia32entry.S | 2 +
> arch/x86/include/asm/unistd_32.h | 2 +
> arch/x86/include/asm/unistd_64.h | 4 ++
> arch/x86/kernel/syscall_table_32.S | 2 +
> fs/compat.c | 61 ++++++++++++++++++++++++++++++++++++
> fs/read_write.c | 48 ++++++++++++++++++++++++++++
> 6 files changed, 119 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 256b00b..9a8501b 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -826,4 +826,6 @@ ia32_sys_call_table:
> .quad sys_dup3 /* 330 */
> .quad sys_pipe2
> .quad sys_inotify_init1
> + .quad compat_sys_preadv
> + .quad compat_sys_pwritev
> ia32_syscall_end:
> diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
> index f2bba78..6e72d74 100644
> --- a/arch/x86/include/asm/unistd_32.h
> +++ b/arch/x86/include/asm/unistd_32.h
> @@ -338,6 +338,8 @@
> #define __NR_dup3 330
> #define __NR_pipe2 331
> #define __NR_inotify_init1 332
> +#define __NR_preadv 333
> +#define __NR_pwritev 334
>
> #ifdef __KERNEL__
>
> diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
> index d2e415e..f818294 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3)
> __SYSCALL(__NR_pipe2, sys_pipe2)
> #define __NR_inotify_init1 294
> __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
> +#define __NR_preadv 295
> +__SYSCALL(__NR_preadv, sys_preadv)
> +#define __NR_pwritev 296
> +__SYSCALL(__NR_pwritev, sys_pwritev)
>
>
> #ifndef __NO_STUBS
> diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
> index d44395f..a1a5506 100644
> --- a/arch/x86/kernel/syscall_table_32.S
> +++ b/arch/x86/kernel/syscall_table_32.S
> @@ -332,3 +332,5 @@ ENTRY(sys_call_table)
> .long sys_dup3 /* 330 */
> .long sys_pipe2
> .long sys_inotify_init1
> + .long sys_preadv
> + .long sys_pwritev
> diff --git a/fs/compat.c b/fs/compat.c
> index e5f49f5..3a25cf3 100644
> --- a/fs/compat.c
> +++ b/fs/compat.c
> @@ -1214,6 +1214,67 @@ out:
> return ret;
> }
>
> +asmlinkage ssize_t
> +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
> + unsigned long vlen, loff_t pos)
> +{
> + struct file *file;
> + ssize_t ret = -EBADF;
> +
> + if (pos < 0)
> + return -EINVAL;
> +
> + file = fget(fd);
> + if (!file)
> + return -EBADF;
> +
> + if (!(file->f_mode & FMODE_READ))
> + goto out;
> +
> + ret = -EINVAL;
> + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
> + goto out;
> +
> + ret = compat_do_readv_writev(READ, file, vec, vlen, &pos);
> +
> +out:
> + if (ret > 0)
> + add_rchar(current, ret);
> + inc_syscr(current);
> + fput(file);
> + return ret;
> +}
> +
> +asmlinkage ssize_t
> +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
> + unsigned long vlen, loff_t pos)
> +{
> + struct file *file;
> + ssize_t ret = -EBADF;
> +
> + if (pos < 0)
> + return -EINVAL;
> +
> + file = fget(fd);
> + if (!file)
> + return -EBADF;
> + if (!(file->f_mode & FMODE_WRITE))
> + goto out;
> +
> + ret = -EINVAL;
> + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
> + goto out;
> +
> + ret = compat_do_readv_writev(WRITE, file, vec, vlen, &pos);
> +
> +out:
> + if (ret > 0)
> + add_wchar(current, ret);
> + inc_syscw(current);
> + fput(file);
> + return ret;
> +}
> +
> asmlinkage long
> compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
> unsigned int nr_segs, unsigned int flags)
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 969a6d9..89f273d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -701,6 +701,54 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
> return ret;
> }
>
> +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)
> +{
> + struct file *file;
> + ssize_t ret = -EBADF;
> + int fput_needed;
> +
> + if (pos < 0)
> + return -EINVAL;
> +
> + file = fget_light(fd, &fput_needed);
> + if (file) {
> + ret = -ESPIPE;
> + if (file->f_mode & FMODE_PREAD)
> + ret = vfs_readv(file, vec, vlen, &pos);
> + fput_light(file, fput_needed);
> + }
> +
> + if (ret > 0)
> + add_rchar(current, ret);
> + inc_syscr(current);
> + return ret;
> +}
> +
> +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> + unsigned long vlen, loff_t pos)
> +{
> + struct file *file;
> + ssize_t ret = -EBADF;
> + int fput_needed;
> +
> + if (pos < 0)
> + return -EINVAL;
> +
> + file = fget_light(fd, &fput_needed);
> + if (file) {
> + ret = -ESPIPE;
> + if (file->f_mode & FMODE_PWRITE)
> + ret = vfs_writev(file, vec, vlen, &pos);
> + fput_light(file, fput_needed);
> + }
> +
> + if (ret > 0)
> + add_wchar(current, ret);
> + inc_syscw(current);
> + return ret;
> +}
> +
> static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> size_t count, loff_t max)
> {
> --
> 1.5.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-arch" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
git://git.kernel.org/pub/scm/docs/man-pages/man-pages.git
man-pages online: http://www.kernel.org/doc/man-pages/online_pages.html
Found a bug? http://www.kernel.org/doc/man-pages/reporting_bugs.html

2008-12-14 11:49:47

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri, Dec 12, 2008 at 08:29:29AM -0700, Matthew Wilcox wrote:
> On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:
> > The patch sports the actual system call implementation and the windup in
> > the x86 system call tables. Other archs are TBD.
>
> > +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> > + unsigned long vlen, loff_t pos)
> > +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> > + unsigned long vlen, loff_t pos)
>
> Are these prototypes required? MIPS and PARISC will need wrappers to
> fix them if they are. These two architectures have an ABI which
> requires 64-bit arguments to be passed in aligned pairs of registers,
> but glibc doesn't know that (and given the existence of syscall(3),
> can't do much about it even if it knew), so some of the arguments end up
> in the wrong registers.
>
> Things will go much better if we can prototype these as:
>
> asmlinkage ssize_t sys_preadv(unsigned int fd, const struct iovec __user *vec,
> loff_t pos, unsigned long vlen);
> asmlinkage ssize_t sys_pwritev(unsigned int fd, const struct iovec __user *vec,
> loff_t pos, unsigned long vlen);
>
> That way 'pos' ends up split between arg2 and arg3 and vlen ends up in
> arg4 instead of having vlen in arg2 and pos in arg3 and arg4 which then
> have to be munged to be in arg4 and arg5 by a compat wrapper.
>
> I seem to recall the s390 folks having some concerns with this kind of
> thing too, but I forget what they are, so I'll let them weigh in on
> this.

I think a lot of stuff that needs to be known for new system calls was
written up with these two postings:

http://marc.info/?l=linux-arch&m=118277150812137&w=2
http://lkml.org/lkml/2007/8/1/354

2008-12-15 04:57:31

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Matthew Wilcox writes:

> On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:
> > The patch sports the actual system call implementation and the windup in
> > the x86 system call tables. Other archs are TBD.
>
> > +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> > + unsigned long vlen, loff_t pos)
> > +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> > + unsigned long vlen, loff_t pos)
>
> Are these prototypes required? MIPS and PARISC will need wrappers to
> fix them if they are. These two architectures have an ABI which
> requires 64-bit arguments to be passed in aligned pairs of registers,

As does 32-bit PowerPC, so I also would prefer the alternate argument
order for the syscall (pos as the 3rd argument).

Paul.

2008-12-15 06:20:52

by David Miller

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

From: Paul Mackerras <[email protected]>
Date: Mon, 15 Dec 2008 15:14:18 +1100

> Matthew Wilcox writes:
>
> > On Fri, Dec 12, 2008 at 03:00:40PM +0100, Gerd Hoffmann wrote:
> > > The patch sports the actual system call implementation and the windup in
> > > the x86 system call tables. Other archs are TBD.
> >
> > > +asmlinkage ssize_t sys_preadv(unsigned long fd, const struct iovec __user *vec,
> > > + unsigned long vlen, loff_t pos)
> > > +asmlinkage ssize_t sys_pwritev(unsigned long fd, const struct iovec __user *vec,
> > > + unsigned long vlen, loff_t pos)
> >
> > Are these prototypes required? MIPS and PARISC will need wrappers to
> > fix them if they are. These two architectures have an ABI which
> > requires 64-bit arguments to be passed in aligned pairs of registers,
>
> As does 32-bit PowerPC, so I also would prefer the alternate argument
> order for the syscall (pos as the 3rd argument).

FWIW 32-bit sparc does not have the aligned register requirement
for 64-bit arguments.

2008-12-15 15:47:22

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Fri 2008-12-12 20:56:00, Gerd Hoffmann wrote:
> Russell King wrote:
> >> should be similar to pread/pwrite, e.g:
> >>
> >> int preadv(fd, iovec, iovec_size, offset)
> >
> > Yes, and that's easy for glibc to achieve.
>
> This hints the ABI problem exists at syscall level only. Is that
> correct? So we can have
>
> preadv(fd, vec, vlen, off)
>
> argument ordering at app <-> glibc level and
>
> preadv(fd, vec, off, vlen)
>
> ordering at glibc <-> kernel (aka syscall) level and it works fine for
> ARM + MIPS + PARISC?

ltrace and strace would show different values; very misleading :-(.

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2008-12-15 16:37:50

by Jennifer Pioch

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On 12/12/08, Gerd Hoffmann <[email protected]> wrote:
> Matthew Wilcox wrote:
> > On the other hand, NetBSD have approximately 0% market share.
> > We shouldn't let them lock us into making a bad decision. Is there
> > anyone other than NetBSD who has added these syscalls?
>
>
> Free- and OpenBSD have it too. For Solaris I've found a feature request
> only. Dunno about MacOS/Darwin. Other un*xes which are important these
> days?

Do you know the ID of the feature request?

Jenny
--
Jennifer Pioch, Uni Frankfurt

2008-12-15 20:43:37

by Gerd Hoffmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

Jennifer Pioch wrote:
> On 12/12/08, Gerd Hoffmann <[email protected]> wrote:
>> Free- and OpenBSD have it too. For Solaris I've found a feature request
>> only. Dunno about MacOS/Darwin. Other un*xes which are important these
>> days?
>
> Do you know the ID of the feature request?

#1167819 @ bugs.opensolaris.org

HTH,
Gerd

2008-12-16 09:57:39

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH v2] Add preadv and pwritev system calls.

On Monday 15 December 2008, Gerd Hoffmann wrote:
> Jennifer Pioch wrote:
> > On 12/12/08, Gerd Hoffmann <[email protected]> wrote:
> >> Free- and OpenBSD have it too. ?For Solaris I've found a feature request
> >> ?only. ?Dunno about MacOS/Darwin. ?Other un*xes which are important these
> >> ?days?
> >
> > Do you know the ID of the feature request?
>
> #1167819 @ bugs.opensolaris.org
>

Looks like they've been working on it for some time:
"Submit Date 25-MAY-1994"!

Arnd <><

2008-12-17 02:12:37

by Dan Mick

[permalink] [raw]
Subject: Re: [osol-code] [PATCH v2] Add preadv and pwritev system calls.

Arnd Bergmann wrote:
> On Monday 15 December 2008, Gerd Hoffmann wrote:
>> Jennifer Pioch wrote:
>>> On 12/12/08, Gerd Hoffmann <[email protected]> wrote:
>>>> Free- and OpenBSD have it too. For Solaris I've found a feature request
>>>> only. Dunno about MacOS/Darwin. Other un*xes which are important these
>>>> days?
>>> Do you know the ID of the feature request?
>> #1167819 @ bugs.opensolaris.org
>>
>
> Looks like they've been working on it for some time:
> "Submit Date 25-MAY-1994"!
>
> Arnd <><

No one's expended any serious time on it at all, it looks like, other than
discussion.

I see the one comment says that it can be done with lio_listio(3RT) (apparently
in LIO_WAIT mode), and that interface does seem to offer all the preadv/pwritev
functionality. That's not to say that we shouldn't implement preadv/pwritev
literally, just that there should be a way to get that sort of functionality today.