Hello,
this patch against 2.5.29 adds the async-io API from Ben's latest
patch.
I find the dynamic syscall approach in some vendor kernel out there
that implements a /proc/libredhat unacceptable since it's not forward
compatible with 2.5:
@@ -636,6 +637,12 @@
.long SYMBOL_NAME(sys_ni_syscall) /* reserved for fremovexattr */
.long SYMBOL_NAME(sys_tkill)
+ .rept __NR_sys_dynamic_syscall-(.-sys_call_table)/4
+ .long SYMBOL_NAME(sys_ni_syscall)
+ .endr
+ .long SYMBOL_NAME(sys_dynamic_syscall)
+ .long SYMBOL_NAME(sys_io_submit)
+
.rept NR_syscalls-(.-sys_call_table)/4
.long SYMBOL_NAME(sys_ni_syscall)
.endr
diff -urN v2.4.19-pre5/include/asm-i386/unistd.h linux.diff/include/asm-i386/unistd.h
--- v2.4.19-pre5/include/asm-i386/unistd.h Wed Apr 3 21:04:38 2002
+++ linux.diff/include/asm-i386/unistd.h Sat May 18 11:44:01 2002
@@ -245,6 +245,9 @@
#define __NR_tkill 238
+#define __NR_sys_dynamic_syscall 250
+#define __NR_io_submit 251
+
/* user-visible error numbers are in the range -1 - -124: see
* <asm-i386/errno.h> */
To try not to execute random code they use a magic number chosen at
compile time from /dev/urandom, so the probability of executing random
code is low, but there's still a chance. For io_submit I'm not even
sure it's using the magic anymore (I guess checking the cookie
payload was a showstopper performance hit; in some older patch the
io_submit operation was passing through the slowdown of the dynamic
syscall, but in fact the new code does this:
+asmlinkage long vsys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+ long res;
+ __asm__ volatile ("int $0x80"
+ : "=a" (res)
+ : "0" (__NR_io_submit), "b" (ctx_id), "c" (nr),
+ "d" (iocbpp));
+ return res;
+}
). So I would ask if you could merge the below interface into 2.5 so we can
ship a real async-io with real syscalls in 2.4, there's not much time to
change it given this is just used in production userspace today. I
prepared a patch against 2.5.29. Ben, I would appreciate if you could
review and confirm you're fine with it too.
BTW, I'm not the author of the API, and personally I dislike the
sys_io_submit approach; the worst part is the multiplexing of course:
+ if (IOCB_CMD_PREAD == tmp.aio_lio_opcode) {
+ op = file->f_op->aio_read;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ } else if (IOCB_CMD_PREADX == tmp.aio_lio_opcode) {
+ op = file->f_op->aio_readx;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ } else if (IOCB_CMD_PWRITE == tmp.aio_lio_opcode) {
+ op = file->f_op->aio_write;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ } else if (IOCB_CMD_FSYNC == tmp.aio_lio_opcode) {
+ op = file->f_op->aio_fsync;
+ } else if (IOCB_CMD_POLL == tmp.aio_lio_opcode) {
+ op = generic_aio_poll;
+ } else
+ op = NULL;
instead of separate syscalls for the various async_io
PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry
point and a parameter specifies the operation. But this is what the
current userspace expects and I wouldn't have too much time to change it
anyways because then I would break all the userspace libs too (I just
break them because of the true syscalls instead of passing through the
/proc/libredhat that calls into the dynamic syscall, but that's not
too painful to adapt). And after all even the io_submit isn't too bad
besides the above slowdown in the multiplexing (at least it's sharing
some icache for top/bottom of the functionality).
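To make the comparison concrete, here is a rough sketch of the two shapes; the
sys_aio_* prototypes are purely hypothetical (they exist in no patch), only the
sys_io_submit prototype matches the patch below:

/* hypothetical per-operation entry points (not in any patch): the kernel
 * would dispatch via the syscall table instead of an opcode field */
asmlinkage long sys_aio_pread(aio_context_t ctx, int fd, void *buf,
                              size_t nbytes, loff_t offset);
asmlinkage long sys_aio_pwrite(aio_context_t ctx, int fd, const void *buf,
                               size_t nbytes, loff_t offset);
asmlinkage long sys_aio_fsync(aio_context_t ctx, int fd);

/* the single multiplexed entry point from the patch below: the operation
 * is selected by iocb->aio_lio_opcode inside the kernel */
asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp);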
checked that it still compiles fine on x86 (all other archs should keep
compiling too). available also from here:
http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1
Comments are welcome, many thanks.
diff -urNp 2.5.29/arch/i386/kernel/entry.S aio-api-1/arch/i386/kernel/entry.S
--- 2.5.29/arch/i386/kernel/entry.S Sat Jul 27 06:07:21 2002
+++ aio-api-1/arch/i386/kernel/entry.S Tue Jul 30 05:23:46 2002
@@ -753,6 +753,12 @@ ENTRY(sys_call_table)
.long sys_sched_setaffinity
.long sys_sched_getaffinity
.long sys_set_thread_area
+ .long sys_io_setup
+ .long sys_io_destroy /* 245 */
+ .long sys_io_submit
+ .long sys_io_cancel
+ .long sys_io_wait
+ .long sys_io_getevents
.rept NR_syscalls-(.-sys_call_table)/4
.long sys_ni_syscall
diff -urNp 2.5.29/fs/Makefile aio-api-1/fs/Makefile
--- 2.5.29/fs/Makefile Wed Jul 17 02:13:47 2002
+++ aio-api-1/fs/Makefile Tue Jul 30 05:25:03 2002
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o f
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o direct-io.o
+ fs-writeback.o mpage.o direct-io.o aio.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
diff -urNp 2.5.29/fs/aio.c aio-api-1/fs/aio.c
--- 2.5.29/fs/aio.c Thu Jan 1 01:00:00 1970
+++ aio-api-1/fs/aio.c Tue Jul 30 05:33:20 2002
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+#include <linux/aio.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+
+asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_io_destroy(aio_context_t ctx)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb,
+ const struct timespec *timeout)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+ long nr,
+ struct io_event *events,
+ const struct timespec *timeout)
+{
+ return -ENOSYS;
+}
diff -urNp 2.5.29/include/asm-i386/unistd.h aio-api-1/include/asm-i386/unistd.h
--- 2.5.29/include/asm-i386/unistd.h Sun Apr 14 22:09:06 2002
+++ aio-api-1/include/asm-i386/unistd.h Tue Jul 30 05:22:38 2002
@@ -247,6 +247,13 @@
#define __NR_futex 240
#define __NR_sched_setaffinity 241
#define __NR_sched_getaffinity 242
+#define __NR_set_thread_area 243
+#define __NR_io_setup 244
+#define __NR_io_destroy 245
+#define __NR_io_submit 246
+#define __NR_io_cancel 247
+#define __NR_io_wait 248
+#define __NR_io_getevents 249
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -urNp 2.5.29/include/linux/aio.h aio-api-1/include/linux/aio.h
--- 2.5.29/include/linux/aio.h Thu Jan 1 01:00:00 1970
+++ aio-api-1/include/linux/aio.h Tue Jul 30 05:32:30 2002
@@ -0,0 +1,6 @@
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <linux/aio_abi.h>
+
+#endif /* __LINUX__AIO_H */
diff -urNp 2.5.29/include/linux/aio_abi.h aio-api-1/include/linux/aio_abi.h
--- 2.5.29/include/linux/aio_abi.h Thu Jan 1 01:00:00 1970
+++ aio-api-1/include/linux/aio_abi.h Tue Jul 30 05:57:23 2002
@@ -0,0 +1,86 @@
+/* linux/aio_abi.h
+ *
+ * Copyright 2000,2001,2002 Red Hat.
+ *
+ * Written by Benjamin LaHaise <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation is hereby granted, provided that the above copyright
+ * notice appears in all copies. This software is provided without any
+ * warranty, express or implied. Red Hat makes no representations about
+ * the suitability of this software for any purpose.
+ *
+ * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+ * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND
+ * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ */
+#ifndef __LINUX__AIO_ABI_H
+#define __LINUX__AIO_ABI_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long aio_context_t;
+
+enum {
+ IOCB_CMD_PREAD = 0,
+ IOCB_CMD_PWRITE = 1,
+ IOCB_CMD_FSYNC = 2,
+ IOCB_CMD_FDSYNC = 3,
+ IOCB_CMD_PREADX = 4,
+ IOCB_CMD_POLL = 5,
+ IOCB_CMD_NOOP = 6,
+};
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+ __u64 data; /* the data field from the iocb */
+ __u64 obj; /* what iocb this event came from */
+ __s64 res; /* result code for this event */
+ __s64 res2; /* secondary result */
+};
+
+#if defined(__LITTLE_ENDIAN)
+#define PADDED(x,y) x, y
+#elif defined(__BIG_ENDIAN)
+#define PADDED(x,y) y, x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland. its up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+ /* these are internal to the kernel/libc. */
+ __u64 aio_data; /* data to be returned in event's data */
+ __u32 PADDED(aio_key, aio_reserved1);
+ /* the kernel sets aio_key to the req # */
+
+ /* common fields */
+ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */
+ __s16 aio_reqprio;
+ __u32 aio_fildes;
+
+ __u64 aio_buf;
+ __u64 aio_nbytes;
+ __s64 aio_offset;
+
+ /* extra parameters */
+ __u64 aio_reserved2;
+ __u64 aio_reserved3;
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#endif /* __LINUX__AIO_ABI_H */
Andrea
On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> I find the dynamic syscall approach in some vendor kernel out there
> that implements a /proc/libredhat unacceptable since it's not forward
> compatible with 2.5:
What is /proc/libredhat supposed to be? It hasn't ever been part of the
AIO patches.
> ). So I would ask if you could merge the below interface into 2.5 so we can
> ship a real async-io with real syscalls in 2.4, there's not much time to
> change it given this is just used in production userspace today. I
> prepared a patch against 2.5.29. Ben, I would appreciate if you could
> review and confirm you're fine with it too.
Please don't. First Ben has indicated on kernel summit that the abi might
change and I think it's a bad idea to lock him into the old ABI just because
suse doesn't want to have something called libredhat.so* in /lib.
Alternate suggestion: rename it to libunited.so.
And even if there is a syscall reservation the way to do it is not to add
the real syscall names to entry.S and implement stubs but to use
sys_ni_syscall.
> BTW, I'm not the author of the API, and personally I dislike the
> sys_io_submit approach; the worst part is the multiplexing of course:
Okay. So you think the API is stupid but want it to get in without
discussion??
If you really want to ship the old-style AIO (of which I remember Ben
saying it is broken for everything post-2.4.9) please stick to the patch
Ben has around, otherwise wait for the proper 2.5 solution. I have my
doubts that it is backportable, though.
On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> I find the dynamic syscall approach in some vendor kernel out there
> that implements a /proc/libredhat unacceptable since it's not forward
> compatible with 2.5:
How do you know this one will be compatible with 2.5 ?
You yourself had suggestions for improving the interface
and I wouldn't be surprised if at least some of those
would get merged for 2.5 and would end up changing the
interface ;)
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> instead of separate syscalls for the various async_io
> PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry
> point and a parameter specifies the operation. But this is what the
> current userspace expects and I wouldn't have too much time to change it
> anyways because then I would break all the userspace libs too (I just
> break them because of the true syscalls instead of passing through the
> /proc/libredhat that calls into the dynamic syscall, but that's not
> too painful to adapt). And after all even the io_submit isn't too bad
> besides the above slowdown in the multiplexing (at least it's sharing
> some icache for top/bottom of the functionality).
What would you suggest as an alternative API? The main point of multiplexing
is that ios can be submitted in batches, which can't be done if the ios are
submitted via individual syscalls, not to mention the overlap with the posix
aio api.
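For reference, the existing POSIX AIO batch interface has a very similar shape;
a plain userspace sketch of lio_listio() (nothing here comes from the aio patches):

#include <aio.h>
#include <string.h>

static int posix_batch(int fd, void *rbuf, void *wbuf, size_t len)
{
        struct aiocb rd, wr;
        struct aiocb *list[2] = { &rd, &wr };

        memset(&rd, 0, sizeof(rd));
        rd.aio_fildes = fd;
        rd.aio_buf = rbuf;
        rd.aio_nbytes = len;
        rd.aio_offset = 0;
        rd.aio_lio_opcode = LIO_READ;   /* opcode per element, like the iocb */

        memset(&wr, 0, sizeof(wr));
        wr.aio_fildes = fd;
        wr.aio_buf = wbuf;
        wr.aio_nbytes = len;
        wr.aio_offset = (off_t)len;
        wr.aio_lio_opcode = LIO_WRITE;

        /* one call submits the whole batch without waiting for completion */
        return lio_listio(LIO_NOWAIT, list, 2, NULL);
}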
> checked that it still compiles fine on x86 (all other archs should keep
> compiling too). available also from here:
>
> http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1
>
> Comments are welcome, many thanks.
That's the old cancellation API. Anyways, the core is pretty much ready, so
don't bother with this patch.
-ben
On Tue, 30 Jul 2002, Christoph Hellwig wrote:
>
> And even if there is a syscall reservation the way to do it is not to add
> the real syscall names to entry.S and implement stubs but to use
> sys_ni_syscall.
Note that something needs to get moving on this rsn, I'm not interested in
getting aio patches on Oct 30th. The feature freeze may be on Halloween,
but if I get some big feature just days before I'm likely to just say
"screw it".
I think we can still change the stuff in 2.5.x, but I really want to start
seeing some code, so that I'm not taken by surprise by something that
obviously sucks.
Is there any activity on linux-aio? I haven't heard anything since Ottawa.
Linus
On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
>
> Anyways, the core is pretty much ready, so
Hey Ben, that sounds great. I have been looking forward to it
to find out how much has changed and if you've left anything
for us to do :) (other than docs and driver fixes :( )
I did have an updated version of the bio traversal patch
(for 2.5.29) that avoids modifications to the bv_offset/bv_len
fields by the block layer, though I don't know if you
still need it. Besides, you probably wouldn't run into
those cases often, as the partial request completions
are probably rare. But just as an FYI ...
Regards
Suparna
>
On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
>
> this patch against 2.5.29 adds the async-io API from Ben's latest
> patch.
Why not make the io_submit system call number 251 like it apparently is
already in 2.4.x? We're really close to it anyway, so if you just re-order
the system calls a bit (and leave 250 as sys_ni_syscall), you're basically
there.
Other than that it looks good.
Linus
On Tue, Jul 30, 2002 at 06:40:43AM -0700, Linus Torvalds wrote:
> I think we can still change the stuff in 2.5.x, but I really want to start
> seeing some code, so that I'm not taken by surprise by something that
> obviously sucks.
Sorry, I was away last week. I'm updating patches to 2.5.29, and should have
them ready by the afternoon for people to comment on. There are a couple of
things to check on ia64 and x86-64 ABI-wise, and people need to comment on the
in-kernel f_ops->read/write changes.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Tue, Jul 30, 2002 at 06:34:53AM -0700, Linus Torvalds wrote:
>
>
> On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> >
> > this patch against 2.5.29 adds the async-io API from Ben's latest
> > patch.
>
> Why not make the io_submit system call number 251 like it apparently is
> already in 2.4.x? We're really close to it anyway, so if you just re-order
> the system calls a bit (and leave 250 as sys_ni_syscall), you're basically
> there.
>
> Other than that it looks good.
Thank you very much for checking it. Since Ben asked to wait for his
patch you can reject my patch, that's really fine with me as long as it
doesn't take months for his patch to show up. My patch is in perfect sync
with his latest code on the web.
As said, I never claimed the current API is stupid as Christoph understood; I
said I'd have preferred sys_aio_read/write/fsync etc., but I could live
fine with sys_io_submit too, it wasn't bad enough to make me rewrite
it.
With my patch I mainly wanted to draw attention to this issue so we can
hopefully get an API registered in mainline within a few weeks. I'm
completely flexible about rewriting the API too if anybody finds good reasons
for it (or if you say sys_io_submit is too ugly, please change it to
sys_aio_read/write/etc.).
As Ben said, the API is the only thing that has been mostly stable so far;
this is one more reason I felt this is the right way to proceed instead
of building the dynamic syscall slowdown/overhead layer that at best
(unsure for sys_io_submit 250) is forward binary compatible with 2.5 by
pure luck.
thanks,
Andrea
On Tue, Jul 30, 2002 at 09:11:40AM +0100, Christoph Hellwig wrote:
> On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > I find the dynamic syscall approach in some vendor kernel out there
> > that implements a /proc/libredhat unacceptable since it's not forward
> > compatible with 2.5:
>
> What is /proc/libredhat supposed to be? It hasn't ever been part of the
> AIO patches.
you should read the code then (from the latest aio-20020619.diff).
diff -urN v2.4.19-pre5/Makefile linux.diff/Makefile
--- v2.4.19-pre5/Makefile Wed Apr 3 21:04:25 2002
+++ linux.diff/Makefile Fri Apr 19 20:57:16 2002
@@ -226,7 +226,7 @@
drivers/sound/pndsperm.c \
drivers/sound/pndspini.c \
drivers/atm/fore200e_*_fw.c drivers/atm/.fore200e_*.fw \
- .version .config* config.in config.old \
+ .uniquebytes .version .config* config.in config.old \
scripts/tkparse scripts/kconfig.tk scripts/kconfig.tmp \
scripts/lxdialog/*.o scripts/lxdialog/lxdialog \
.menuconfig.log \
@@ -268,6 +268,7 @@
--end-group \
-o vmlinux
$(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map
+ @$(MAKE) -C ulib
symlinks:
rm -f include/asm
@@ -296,7 +297,7 @@
linuxsubdirs: $(patsubst %, _dir_%, $(SUBDIRS))
-$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/version.h include/config/MARKER
+$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/compile.h include/config/MARKER
	$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C $(patsubst _dir_%, %, $@)
$(TOPDIR)/include/linux/version.h: include/linux/version.h
@@ -322,6 +323,11 @@
echo \#define LINUX_COMPILE_DOMAIN ; \
fi >> .ver
@echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail -1`\" >> .ver
+ @rm -f .uniquebytes
+ @dd if=/dev/urandom of=.uniquebytes bs=1 count=16
+ @echo -n \#"define LINUX_UNIQUE_BYTES " >>.ver
+ @hexdump -v -e '1/1 "0x%02x, "' .uniquebytes | sed -e 's/, $$//g' >>.ver
+ @echo "" >>.ver
@mv -f .ver $@
include/linux/version.h: ./Makefile
@@ -404,6 +410,8 @@
.PHONY: $(patsubst %, _modinst_%, $(SUBDIRS))
$(patsubst %, _modinst_%, $(SUBDIRS)) :
$(MAKE) -C $(patsubst _modinst_%, %, $@) modules_install
+ mkdir -p $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/
+ install -m 755 ulib/libredhat-kernel.so.1.0.1 $(INSTALL_MOD_PATH)/lib/kernel/$(KERNELRELEASE)/
# modules disabled....
diff -urN v2.4.19-pre5/ulib/Makefile linux.diff/ulib/Makefile
--- v2.4.19-pre5/ulib/Makefile Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/Makefile Fri Apr 19 20:58:01 2002
@@ -0,0 +1,50 @@
+# Makefile - libredhat-kernel.so build code.
+#
+# Copyright 2002 Red Hat, Inc. All Rights Reserved.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+#
+all: libredhat-kernel.so
+
+ASFLAGS=-D__KERNEL__ -D__ASSEMBLY__ -I../include -nostdlib -nostartfiles
+CFLAGS=-D__KERNEL__ -I../include -nostdlib -nostartfiles
+
+so_objs=vsysaddr.o kso_init.o
+
+vsysaddr.S: ../System.map stub.S Makefile
+ rm -f vsysaddr.S
+ echo '#include "stub.S"' >vsysaddr.S
+ awk -- "/^00000000bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+ awk -- "/^bfff.* vsys_/ { print \"dynamic_syscall(\"\$$3 \",0x\" \$$1 \")\"; }" <../System.map >>vsysaddr.S
+
+vsysaddr.o: vsysaddr.S
+
+kso_init.o: ../include/linux/compile.h
+
+libredhat-kernel.so.1.0.1: $(so_objs) libredhat-kernel.map
+ gcc -nostdlib -nostartfiles -shared -Wl,--version-script=libredhat-kernel.map -Wl,-soname=libredhat-kernel.so.1 -o $@ $(so_objs)
+ cp $@ [email protected]
+ strip $@
+
+libredhat-kernel.so: libredhat-kernel.so.1.0.1
+ ln -sf $< $@
+
+clean:
+ rm -f *.o libredhat-kernel.so myln libredhat-kernel.so.1* vsysaddr.S
+
+# test app
+myln: myln.c libredhat-kernel.so Makefile
+ cc -g -o myln myln.c -L. -lredhat-kernel
diff -urN v2.4.19-pre5/ulib/README linux.diff/ulib/README
--- v2.4.19-pre5/ulib/README Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/README Fri Apr 19 20:54:05 2002
@@ -0,0 +1,2 @@
+The libredhat-kernel code is provided under the terms of the LGPL.
+See the file COPYING for details.
diff -urN v2.4.19-pre5/ulib/kso_init.c linux.diff/ulib/kso_init.c
--- v2.4.19-pre5/ulib/kso_init.c Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/kso_init.c Fri Apr 19 20:54:05 2002
@@ -0,0 +1,67 @@
+/* kso_init.c - libredhat-kernel.so startup code.
+
+ Copyright 2002 Red Hat, Inc. All Rights Reserved.
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ */
+#include <linux/compile.h>
+#include <linux/types.h>
+#include <asm/unistd.h>
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/a.out.h>
+
+char libredhat_kernel_enosys = 1; /* the asm in stub.S depends on this */
+
+long _init(void)
+{
+ static char unique[] = { LINUX_UNIQUE_BYTES };
+ int errno;
+ long addr;
+ int fd;
+ int i;
+
+ _syscall6(int, mmap2, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+ _syscall2(long, munmap, unsigned long, addr, size_t, len)
+ _syscall2(int, open, const char *, name, int, flags)
+ _syscall1(int, close, int, fd)
+
+ if (sizeof(unique) != 16)
+ return -1;
+
+ fd = open("/dev/vsys", O_RDONLY);
+ if (-1 == fd)
+ return -1;
+
+ addr = mmap2(0, VSYSCALL_SIZE, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
+ if (-1 == addr)
+ return -1;
+
+ close(fd);
+
+ for (i=0; i<sizeof(unique); i++)
+ if (unique[i] != ((char *)addr)[i]) {
+ munmap(addr, VSYSCALL_SIZE);
+ return -1;
+ }
+
+ /* okay, all the syscalls we provide are now good */
+ libredhat_kernel_enosys = 0;
+ return 0;
+}
+
diff -urN v2.4.19-pre5/ulib/libredhat-kernel.map linux.diff/ulib/libredhat-kernel.map
--- v2.4.19-pre5/ulib/libredhat-kernel.map Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/libredhat-kernel.map Tue Apr 2 18:56:58 2002
@@ -0,0 +1,11 @@
+REDHAT_0.90 {
+ global:
+ vsys_io_setup;
+ vsys_io_destroy;
+ vsys_io_submit;
+ vsys_io_cancel;
+ vsys_io_wait;
+ vsys_io_getevents;
+ local:
+ *;
+};
diff -urN v2.4.19-pre5/ulib/myln.c linux.diff/ulib/myln.c
--- v2.4.19-pre5/ulib/myln.c Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/myln.c Tue Apr 2 18:56:58 2002
@@ -0,0 +1,25 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+int main ()
+{
+ long ctx = 0;
+ extern long vsys_io_setup(long, long *);
+ unsigned char *bob = (void*)&vsys_io_setup;
+ long ret;
+ int i;
+ printf("%p\n", bob);
+ //printf("%p\n", mmap(0, 65536, PROT_READ | PROT_EXEC, MAP_SHARED,
+ //	open("/dev/vsys", O_RDONLY), 0));
+ //for (i=0; i<16; i++)
+ // printf(" %02x\n", bob[i]);
+ //printf("\n");
+
+ ret = vsys_io_setup(100, &ctx);
+
+ printf("ret=%ld, ctx=0x%lx\n", ret, ctx);
+ return 0;
+}
diff -urN v2.4.19-pre5/ulib/stub.S linux.diff/ulib/stub.S
--- v2.4.19-pre5/ulib/stub.S Wed Dec 31 19:00:00 1969
+++ linux.diff/ulib/stub.S Fri Apr 19 20:54:05 2002
@@ -0,0 +1,38 @@
+/* stub.S - libredhat-kernel.so jump code.
+
+ Copyright 2002 Red Hat, Inc. All Rights Reserved.
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ */
+/* stub.S */
+#include <asm/segment.h>
+#include <asm/errno.h>
+
+ .text
+
+#define dynamic_syscall(x,a) \
+ .globl x ;\
+ .type x, @function ;\
+ .align 16 ;\
+ x: ;\
+ cmpb $0,libredhat_kernel_enosys ;\
+ jne 1f ;\
+ ljmp $__USER_CS, $a ;\
+ 1: ;\
+ movl $-ENOSYS,%eax ;\
+ ret ;\
+ .size x,.-x
+
and the other funny parts:
+long sys_dynamic_syscall(struct pt_regs regs)
+{
+ struct dummy_args dummy_args;
+ struct vsyscall_entry *ent = (void *)regs.edx;
+ void *args = (void *)regs.ecx;
+ long ret;
+
+ pr_debug("ent = %p args = %p\n", ent, args);
+ pr_debug("eip = 0x%08lx\n", regs.eip);
+
+ if (unlikely(!current->mm->vsys_mapped))
+ goto err;
@@ -231,6 +232,10 @@
/* Architecture-specific MM context */
mm_context_t context;
+
+ struct kioctx *ioctx_list;
+ unsigned long new_ioctx_id;
+ int vsys_mapped;
};
@@ -243,6 +248,7 @@
mm_count: ATOMIC_INIT(1), \
mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \
page_table_lock: SPIN_LOCK_UNLOCKED, \
+ vsys_mapped: 0, \
mmlist: LIST_HEAD_INIT(name.mmlist), \
}
etc... (oh yeah, it may not be in /proc but the location of this
bytecode doesn't change what it does)
> > ). So I would ask if you could merge the below interface into 2.5 so we can
> > ship a real async-io with real syscalls in 2.4, there's not much time to
> > change it given this is just used in production userspace today. I
> > prepared a patch against 2.5.29. Ben, I would appreciate if you could
> > review and confirm you're fine with it too.
>
> Please don't. First Ben has indicated on kernel summit that the abi might
> change and I think it's a bad idea to lock him into the old ABI just because
What I heard, and I remember it crystal clear, is that Ben indicated that
the API isn't changing for a long time, and that it's been stable so far;
I could imagine why.
> suse doesn't want to have something called libredhat.so* in /lib.
I don't mind about the libredhat name; it's the above nonsense overhead
code that isn't going to be included into my tree. It's just pure
overhead and complication that will only make it worse.
> Alternate suggestion: rename it to libunited.so.
That's not the problem, sorry.
> And even if there is a syscall reservation the way to do it is not to add
> the real syscall names to entry.S and implement stubs but to use
> sys_ni_syscall.
If there is a syscall reservation, the way to do it is the way I did it as
far as I can tell; I don't see what you mean by using sys_ni_syscall or
whatever. Of course right now we're missing the syscall reservation, and that's
why I'm trying to register the API.
>
> > BTW, I'm not the author of the API, and personally I dislike the
> > sys_io_submit approach; the worst part is the multiplexing of course:
>
> Okay. So you think the API is stupid but want it to get in without
> discussion??
I didn't say it's stupid, you said it, not me. I only said I would have
preferred it to be sys_aio_read, sys_aio_write, sys_aio_fsync etc.
rather than a sys_io_submit that takes read/write/fsync as a parameter.
But I don't mind, it's a minor difference that's not going to hurt
performance too much. What I'm trying to do is to have an API registered
within one or two months; what kind of API it is is the last of my
interests as long as it's not stupid (and the current one isn't stupid,
unlike you said). I can live with sys_io_submit or
sys_aio_read/write/fsync, and since everybody is using this current API
at the moment I pushed for it, because I assume people are just used to
it (userspace is), and because it would have been an additional crusade
to try to also submit a different API (as expected Ben is advocating for
his API, showing the good points of it), and as said it's not too bad
(and after all it's not us who are supposed to deal with this API in the
first place, we almost never use async-io anyways and I don't see that
changing tomorrow).
If it had been stupid I would have rewritten it, like I did with part of
the internals that gave you a root shell etc.; I'm not completely blind :).
> If you really want to ship the old-style AIO (of which I remember Ben
> saying it is broken for everything post-2.4.9) please stick to the patch
> Ben has around, otherwise wait for the proper 2.5 solution. I have my
> doubts that it is backportable, though.
I'm trying to do my best to avoid having to merge the code I quoted
above (which is disgusting), and since the API isn't gonna change anyway
like Ben said I'm trying to do the right thing to avoid clashes with
syscall 250 as well.
Hope this clarifies my point.
One more thing: I will be completely fine if my patch goes to /dev/null
because Ben wants to make some change, or because he wants to submit it
with a full-blown implementation too, or even better because we change
the API after a collective thought. I just wanted to raise the
discussion and to have people focused on the API. If it takes a few
weeks or one month to get an API registered that's of course fine. If
it takes more I'll be forced to merge the code I quoted above, and as
said I'm trying to avoid that. As far as I'm concerned only the API
matters to me, because the 2.4 internals that I merged aren't going to
be the right thing to do long term anyway. As Ben pointed out, this
async-io isn't even asynchronous in submit_bh, so you cannot even use it
to fill multiple request queues when dealing with multiple spindles;
maybe that's fixed in the 2.5 version, just grep for changes in
ll_rw_block, if there's some change there's a chance that such issue is
addressed. Blocking in the VM is perfectly fine, I don't think we should
change that (we have to, as Linus said); what we shouldn't do is block
in submit_bh, because that prevents users from using async-io to fill
multiple queues.
Really last thing: one of the major reasons I don't like the above code
besides the overhead and complexity it introduces is that it doesn't
guarantee 100% that it will be forward compatible with 2.5 applications
(the syscall 250 looks not to check even for the payload, I guess they
changed it because it was too slow to be forward compatible in most
cases), the /dev/urandom payload may match the user arguments if you're
unlucky and since we can guarantee correct operations by doing a syscall
registration, I don't see why we should make it work by luck.
Andrea
On Tue, Jul 30, 2002 at 06:43:20PM +0200, Andrea Arcangeli wrote:
> >
> > Please don't. First Ben has indicated on kernel summit that the abi might
> > change and I think it's a bad idea to lock him into the old ABI just because
>
> What I heard, and I remember it crystal clear, is that Ben indicated that
> the API isn't changing for a long time, and that it's been stable so far;
> I could imagine why.
I suspect what Christoph is remembering is that the in-kernel API was still
in flux and up for discussion.
> I'm trying to do my best to avoid having to merge the code I quoted
> above (which is disgusting), and since the API isn't gonna change anyway
> like Ben said I'm trying to do the right thing to avoid clashes with
> syscall 250 as well.
syscall 250 isn't used in anything Red Hat shipped, that was a matter
of experimentation I was doing in recent aio development trees (which
is what the 2.4.18 patches are, as they still cause that VM to OOM under
rather trivial io patterns).
> Really last thing: one of the major reasons I don't like the above code
> besides the overhead and complexity it introduces is that it doesn't
> guarantee 100% that it will be forward compatible with 2.5 applications
> (the syscall 250 looks not to check even for the payload, I guess they
> changed it because it was too slow to be forward compatible in most
> cases), the /dev/urandom payload may match the user arguments if you're
> unlucky and since we can guarantee correct operations by doing a syscall
> registration, I don't see why we should make it work by luck.
You haven't looked at the code very closely then. It checks that the
payload matches, and that the caller is coming from the vsyscall pages.
Yes, the dynamic syscall thing is a horrific kludge that shouldn't be
used, but the vsyscall technique is rather useful. This is something
that x86-64 gets wrong by not requiring the vsyscall page to need an
mmap into the user's address space: UML cannot emulate vsyscalls by
faking the mmap.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
[email protected] said:
> This is something that x86-64 gets wrong by not requiring the
> vsyscall page to need an mmap into the user's address space: UML
> cannot emulate vsyscalls by faking the mmap.
Andrea and I talked about this a bit at KS.
IIRC, he wants vsyscall addresses to be hardcoded constants in libc. He
doesn't want the overhead of doing an indirect call through whatever
address you get from the vsyscall_mmap() syscall.
At first glance, that breaks any hope of UML being able to virtualize that.
Any vsyscall executed by a UML process will go straight into the host kernel,
completely bypassing UML.
We did come up with a scheme that sounded to me like it would work.
/me tries to remember what it was :-)
I think it was that we provide a syscall to move the vsyscall page. UML
will use that to relocate the host vsyscalls and map its own page there.
The final piece is that UML would be linked with a different vsyscall address.
Andrea, does that sound right?
I don't particularly like this scheme - the get-the-address-at-runtime
approach is far cleaner, but it does satisfy Andrea's need for speed.
Jeff
On Tue, Jul 30, 2002 at 02:10:35PM -0500, Jeff Dike wrote:
> We did come up with a scheme that sounded to me like it would work.
A constant address is still an option with an mmap'd device. Just do
an mmap of the device and assert that it is the correct value.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Tue, 30 Jul 2002, Benjamin LaHaise wrote:
> On Tue, Jul 30, 2002 at 02:10:35PM -0500, Jeff Dike wrote:
> > We did come up with a scheme that sounded to me like it would work.
>
> A constant address is still an option with an mmap'd device. Just do
> an mmap of the device and assert that it is the correct value.
That still doesn't get the TLB advantages of a globally shared page at the
same address.. It also has the overhead of mapping it, which you don't
have if the thing is just always in the address space, and all processes
just get created with that page mapped. That can be a big deal for process
startup latency for small processes.
Linus
On Tue, Jul 30, 2002 at 11:15:26AM -0700, Linus Torvalds wrote:
> That still doesn't get the TLB advantages of a globally shared page at the
> same address.. It also has the overhead of mapping it, which you don't
> have if the thing is just always in the address space, and all processes
> just get created with that page mapped. That can be a big deal for process
> startup latency for small processes.
That might be a concern once glibc startup can occur with less than a few
dozen calls to grope through the local files. ;-) Hmmm, it would be possible
to make the vsyscall page mapped by default and leave the global bit enabled
until UML forcibly unmapped it (and then clear the global bit and do a global
invalidate). Would that be acceptable?
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
[email protected] said:
> A constant address is still an option with an mmap'd device. Just do
> an mmap of the device and assert that it is the correct value.
Yeah, but the point of mmapping it is to allow the kernel to choose where
it goes. The host kernel will choose one place for its processes. UML
will choose a different place for its processes. Everything is nice and
virtualizable.
Jeff
[email protected] said:
> Hmmm, it would be possible to make the vsyscall page mapped by
> default and leave the global bit enabled until UML forcibly unmapped
> it (and then clear the global bit and do a global invalidate). Would
> that be acceptable?
That sounds like it would work for me.
Jeff
On Tue, Jul 30, 2002 at 12:59:43PM -0400, Benjamin LaHaise wrote:
> is what the 2.4.18 patches are, as they still cause that VM to OOM under
> rather trivial io patterns).
I would like it if you could reproduce this with the aio in my tree after an:
echo 1 >/proc/sys/vm/vm_gfp_debug
That will give you stack traces that you should send back to me, and I
will tell you exactly what the problem is (if you can reproduce it).
>
> > Really last thing: one of the major reasons I don't like the above code
> > besides the overhead and complexity it introduces is that it doesn't
> > guarantee 100% that it will be forward compatible with 2.5 applications
> > (the syscall 250 looks not to check even for the payload, I guess they
> > changed it because it was too slow to be forward compatible in most
> > cases), the /dev/urandom payload may match the user arguments if you're
> > unlucky and since we can guarantee correct operations by doing a syscall
> > registration, I don't see why we should make it work by luck.
>
> You haven't looked at the code very closely then. It checks that the
> payload matches, and that the caller is coming from the vsyscall pages.
I didn't notice the caller needed to come from the vsyscall pages; that
makes it safer, but still it's a huge complexity that you apparently
disabled in your test tree because it was harming performance.
> that x86-64 gets wrong by not requiring the vsyscall page to need an
> mmap into the user's address space: UML cannot emulate vsyscalls by
I don't want vma overhead in the rbtree, nor in the mm_struct, nor do I
want mmap in general to deal with vsyscalls, for obvious performance
reasons.
> faking the mmap.
The fix for UML is trivial; the simplest approach is to add a prctl that
disables vsyscalls for a certain process and that cannot be re-enabled
by userspace (so a one-way prctl): the vsyscall will be swapped with
a vsyscall that invokes the real syscall, and UML will trap the gettimeofday
syscall like it does on x86. We also discussed some more complicated and
sophisticated approaches, but I like the prctl that forces the
gettimeofday/time syscalls because it could be used trivially for
strace too (of course ltrace will just show the gettimeofday call
because we pass through glibc; in fact UML for 99% of cases could simply
use LD_PRELOAD, but Jeff didn't like it for good reasons: because it's
not transparent enough for userspace and of course it doesn't work with
statically linked binaries).
In short the prctl that redirects the program and all its children to use the
real syscall would be my preferred approach. As said, the UML kernel
should still be able to use the vgettimeofday; only the children (the
userspace running under the UML kernel) will be executed with the prctl
enabled, and the fact that userspace cannot disable the prctl (once enabled
before execve) will guarantee the system will function correctly. It
will require per-task information and a switch_to hack that will
change the fixmap entry and invlpg if needed.
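Just to illustrate the idea, a rough usage sketch of the one-way prctl; the
option name and number are made up, nothing like this exists in any tree yet:

#include <sys/prctl.h>
#include <unistd.h>

#define PR_FORCE_REAL_SYSCALLS  0x5953  /* hypothetical, one-way flag */

/* what a UML tracer (or strace) might do before exec'ing the traced
 * "userspace": once set, the flag survives fork/execve and cannot be
 * cleared, so gettimeofday/time go through the real syscall and can be
 * intercepted like on plain x86 */
static int force_real_syscalls_and_exec(char *const argv[], char *const envp[])
{
        if (prctl(PR_FORCE_REAL_SYSCALLS, 1, 0, 0, 0) < 0)
                return -1;
        return execve(argv[0], argv, envp);
}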
Now I don't remember anymore if I just suggested the above prctl way to
Jeff and he found some weakness in it that could make it not a
feasible way for UML, but in that case he will remind me about it now :)
In fact we will use the same technique of a vsyscall that redirects
to a real syscall for all kinds of vsyscalls that on some hardware may
need to know which CPU they are running on to return the result; this has
never been needed so far, but it was one of the possibilities that our
vsyscall design offered.
Andrea
On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
> On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > instead of separate syscalls for the various async_io
> > PREAD/PREADX/PWRITE/FSYNC/POLL operations there is just a single entry
> > point and a parameter specifies the operation. But this is what the
> > current userspace expects and I wouldn't have too much time to change it
> > anyways because then I would break all the userspace libs too (I just
> > break them because of the true syscalls instead of passing through the
> > /proc/libredhat that calls into the dynamic syscall, but that's not
> > too painful to adapt). And after all even the io_submit isn't too bad
> > besides the above slowdown in the multiplexing (at least it's sharing
> > some icache for top/bottom of the functionality).
>
> What would you suggest as an alternative API? The main point of multiplexing
> is that ios can be submitted in batches, which can't be done if the ios are
> submitted via individual syscalls, not to mention the overlap with the posix
> aio api.
Yes, sys_io_submit has the advantage that you can mix read/write/fsync etc.
in the same array of iocbs. But by the same argument we could as well
have a submit_io instead of sys_read/sys_write/sys_fsync. It's a matter
of dropping that big if/else chain and scaling with the syscall
table lookup instead. So I'd still prefer to nuke sys_io_submit
and iocb.aio_lio_opcode, and to replace them with
sys_aio_read/sys_aio_readx/sys_aio_write/sys_aio_fsync/sys_aio_poll, but
as you said, if it's very common to generate huge arrays of iocbs with
mixed commands the current API would pay off thanks to the reduced
number of kernel entries/exits, at the expense of the cheaper if/else
checks. So I'm pretty much fine either way, and that's why I didn't
propose a modified API in the first place.
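For completeness, a sketch of what the batched path looks like from userspace
with the current API; it assumes the aio_abi.h definitions quoted earlier, a
plain syscall() wrapper, and whatever __NR_io_submit number ends up being
registered:

#include <string.h>
#include <unistd.h>
#include <asm/unistd.h>         /* assumed to provide __NR_io_submit */
#include <linux/aio_abi.h>

static long submit_read_write_fsync(aio_context_t ctx, int fd,
                                    void *rbuf, const void *wbuf, size_t len)
{
        struct iocb rd, wr, sync_cb;
        struct iocb *batch[3] = { &rd, &wr, &sync_cb };

        memset(&rd, 0, sizeof(rd));
        rd.aio_lio_opcode = IOCB_CMD_PREAD;
        rd.aio_fildes = fd;
        rd.aio_buf = (unsigned long)rbuf;
        rd.aio_nbytes = len;
        rd.aio_offset = 0;

        memset(&wr, 0, sizeof(wr));
        wr.aio_lio_opcode = IOCB_CMD_PWRITE;
        wr.aio_fildes = fd;
        wr.aio_buf = (unsigned long)wbuf;
        wr.aio_nbytes = len;
        wr.aio_offset = len;

        memset(&sync_cb, 0, sizeof(sync_cb));
        sync_cb.aio_lio_opcode = IOCB_CMD_FSYNC;
        sync_cb.aio_fildes = fd;

        /* three mixed operations, one kernel entry; with per-op syscalls
         * this would be three entries but no opcode if/else chain inside */
        return syscall(__NR_io_submit, ctx, 3L, batch);
}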
> > checked that it still compiles fine on x86 (all other archs should keep
> > compiling too). available also from here:
> >
> > http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.5/2.5.29/aio-api-1
> >
> > Comments are welcome, many thanks.
>
> That's the old cancellation API. Anyways, the core is pretty much ready, so
> don't bother with this patch.
Can you point me to a patch with the new cancellation API that you
agree with for merging in 2.5 so I can synchronize? I'm reading your
very latest patch uploaded to some site in June; that would be really
helpful, many thanks!
Andrea
On Tue, Jul 30, 2002 at 11:41:16PM +0200, Andrea Arcangeli wrote:
> Can you point me to a patch with the new cancellation API that you
> agree with for merging in 2.5 so I can synchronize? I'm reading your
> very latest patch uploaded to some site in June; that would be really
> helpful, many thanks!
Here is what I've got for the aio core that has the cancellation
change to return the completion event. The other slight change that
I meant to get in before going into the mainstream is to have the
timeout io_getevents takes be an absolute timeout, which helps for
applications that have specific deadlines they are attempting to
schedule to (think video playback). This drop is untested, but I'd
like it if people could provide comments on it.
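To show why an absolute timeout is convenient for deadline-driven callers, a
small usage sketch; only the sys_io_getevents_abs name comes from the patch
below, the __NR_io_getevents_abs number and the syscall() wrapper are
assumptions:

#include <time.h>
#include <unistd.h>
#include <asm/unistd.h>         /* assumed to provide __NR_io_getevents_abs */
#include <linux/aio_abi.h>

/* wait for up to nr events, but never past a fixed wall-clock deadline */
static long wait_until_deadline(aio_context_t ctx, struct io_event *events,
                                long nr, long ns_from_now)
{
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_nsec += ns_from_now;
        deadline.tv_sec  += deadline.tv_nsec / 1000000000L;
        deadline.tv_nsec %= 1000000000L;

        /* a relative timeout would have to be recomputed after every early
         * return; an absolute one can simply be passed in again unchanged */
        return syscall(__NR_io_getevents_abs, ctx, nr, events, &deadline);
}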
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
:r ~/patches/v2.5/v2.5.29-aio-core-A0.diff
diff -urN v2.5.29/arch/i386/kernel/entry.S aio-v2.5.29.diff/arch/i386/kernel/entry.S
--- v2.5.29/arch/i386/kernel/entry.S Tue Jul 30 10:24:32 2002
+++ aio-v2.5.29.diff/arch/i386/kernel/entry.S Tue Jul 30 17:23:04 2002
@@ -753,6 +753,11 @@
.long sys_sched_setaffinity
.long sys_sched_getaffinity
.long sys_set_thread_area
+ .long sys_io_setup
+ .long sys_io_destroy /* 245 */
+ .long sys_io_getevents_abs
+ .long sys_io_submit
+ .long sys_io_cancel
.rept NR_syscalls-(.-sys_call_table)/4
.long sys_ni_syscall
diff -urN v2.5.29/fs/Makefile aio-v2.5.29.diff/fs/Makefile
--- v2.5.29/fs/Makefile Tue Jul 30 09:46:21 2002
+++ aio-v2.5.29.diff/fs/Makefile Tue Jul 30 10:33:45 2002
@@ -8,14 +8,14 @@
O_TARGET := fs.o
export-objs := filesystems.o open.o dcache.o buffer.o bio.o inode.o dquot.o \
- mpage.o
+ mpage.o aio.o
obj-y := open.o read_write.o devices.o file_table.o buffer.o \
bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o direct-io.o
+ fs-writeback.o mpage.o direct-io.o aio.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
diff -urN v2.5.29/fs/aio.c aio-v2.5.29.diff/fs/aio.c
--- v2.5.29/fs/aio.c Wed Dec 31 19:00:00 1969
+++ aio-v2.5.29.diff/fs/aio.c Tue Jul 30 17:22:43 2002
@@ -0,0 +1,1160 @@
+/* fs/aio.c
+ * An async IO implementation for Linux
+ * Written by Benjamin LaHaise <[email protected]>
+ *
+ * Implements an efficient asynchronous io interface.
+ *
+ * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+//#define DEBUG 1
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/vmalloc.h>
+#include <linux/iobuf.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/brlock.h>
+#include <linux/aio.h>
+#include <linux/smp_lock.h>
+#include <linux/compiler.h>
+#include <linux/brlock.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+
+#if DEBUG > 1
+#define dprintk printk
+#else
+#define dprintk(x...) do { ; } while (0)
+#endif
+
+/*------ sysctl variables----*/
+atomic_t aio_nr = ATOMIC_INIT(0); /* current system wide number of aio requests */
+unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+/*----end sysctl variables---*/
+
+static kmem_cache_t *kiocb_cachep;
+static kmem_cache_t *kioctx_cachep;
+
+/* Used for rare fput completion. */
+static void aio_fput_routine(void *);
+static struct tq_struct fput_tqueue = {
+ routine: aio_fput_routine,
+};
+
+static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(fput_head);
+
+/* aio_setup
+ * Creates the slab caches used by the aio routines, panic on
+ * failure as this is done early during the boot sequence.
+ */
+static int __init aio_setup(void)
+{
+ kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!kiocb_cachep)
+ panic("unable to create kiocb cache\n");
+
+ kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!kioctx_cachep)
+ panic("unable to create kioctx cache");
+
+ printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+
+ return 0;
+}
+
+static void ioctx_free_reqs(struct kioctx *ctx)
+{
+ struct list_head *pos, *next;
+ list_for_each_safe(pos, next, &ctx->free_reqs) {
+ struct kiocb *iocb = list_kiocb(pos);
+ list_del(&iocb->ki_list);
+ kmem_cache_free(kiocb_cachep, iocb);
+ }
+}
+
+static void aio_free_ring(struct kioctx *ctx)
+{
+ struct aio_ring_info *info = &ctx->ring_info;
+ long i;
+
+ for (i=0; i<info->nr_pages; i++)
+ put_page(info->ring_pages[i]);
+
+ if (info->mmap_size) {
+ down_write(&ctx->mm->mmap_sem);
+ do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+ up_write(&ctx->mm->mmap_sem);
+ }
+
+ if (info->ring_pages && info->ring_pages != info->internal_pages)
+ kfree(info->ring_pages);
+ info->ring_pages = NULL;
+ info->nr = 0;
+}
+
+static int aio_setup_ring(struct kioctx *ctx)
+{
+ struct aio_ring *ring;
+ struct aio_ring_info *info = &ctx->ring_info;
+ unsigned nr_reqs = ctx->max_reqs;
+ unsigned long size;
+ int nr_pages;
+
+ /* Compensate for the ring buffer's head/tail overlap entry */
+ nr_reqs += 2; /* 1 is required, 2 for good luck */
+
+ size = sizeof(struct aio_ring);
+ size += sizeof(struct io_event) * nr_reqs;
+ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ if (nr_pages < 0)
+ return -EINVAL;
+
+ info->nr_pages = nr_pages;
+
+ nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+
+ info->nr = 0;
+ info->ring_pages = info->internal_pages;
+ if (nr_pages > AIO_RING_PAGES) {
+ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ if (!info->ring_pages)
+ return -ENOMEM;
+ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
+ }
+
+ info->mmap_size = nr_pages * PAGE_SIZE;
+ dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
+ down_write(&ctx->mm->mmap_sem);
+ info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
+ PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE,
+ 0);
+ if (IS_ERR((void *)info->mmap_base)) {
+ up_write(&ctx->mm->mmap_sem);
+ printk("mmap err: %ld\n", -info->mmap_base);
+ info->mmap_size = 0;
+ aio_free_ring(ctx);
+ return -EAGAIN;
+ }
+
+ dprintk("mmap address: 0x%08lx\n", info->mmap_base);
+ info->nr_pages = get_user_pages(current, ctx->mm,
+ info->mmap_base, info->mmap_size,
+ 1, 0, info->ring_pages, NULL);
+ up_write(&ctx->mm->mmap_sem);
+
+ if (unlikely(info->nr_pages != nr_pages)) {
+ aio_free_ring(ctx);
+ return -EAGAIN;
+ }
+
+ ctx->user_id = info->mmap_base;
+
+ info->nr = nr_reqs; /* trusted copy */
+
+ ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring->nr = nr_reqs; /* user copy */
+ ring->id = ctx->user_id;
+ ring->head = ring->tail = 0;
+ ring->magic = AIO_RING_MAGIC;
+ ring->compat_features = AIO_RING_COMPAT_FEATURES;
+ ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
+ ring->header_length = sizeof(struct aio_ring);
+ kunmap_atomic(ring, KM_USER0);
+
+ return 0;
+}
+
+/* aio_ring_event: returns a pointer to the event at the given index from
+ * kmap_atomic(, km). Release the pointer with put_aio_ring_event();
+ */
+static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km)
+{
+ struct io_event *events;
+#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
+#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
+
+ if (nr < AIO_EVENTS_FIRST_PAGE) {
+ struct aio_ring *ring;
+ ring = kmap_atomic(info->ring_pages[0], km);
+ return &ring->io_events[nr];
+ }
+ nr -= AIO_EVENTS_FIRST_PAGE;
+
+ events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km);
+
+ return events + (nr % AIO_EVENTS_PER_PAGE);
+}
+
+static inline void put_aio_ring_event(struct io_event *event, enum km_type km)
+{
+ void *p = (void *)((unsigned long)event & PAGE_MASK);
+ kunmap_atomic(p, km);
+}
+
+/* ioctx_alloc
+ * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
+ */
+static struct kioctx *ioctx_alloc(unsigned nr_reqs)
+{
+ struct mm_struct *mm;
+ struct kioctx *ctx;
+ unsigned i;
+
+ /* Prevent overflows */
+ if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) ||
+ (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) {
+ pr_debug("ENOMEM: nr_reqs too high\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (nr_reqs > aio_max_nr)
+ return ERR_PTR(-EAGAIN);
+
+ ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->max_reqs = nr_reqs;
+ mm = ctx->mm = current->mm;
+ atomic_inc(&mm->mm_count);
+
+ atomic_set(&ctx->users, 1);
+ spin_lock_init(&ctx->ctx_lock);
+ spin_lock_init(&ctx->ring_info.ring_lock);
+ init_waitqueue_head(&ctx->wait);
+
+ INIT_LIST_HEAD(&ctx->free_reqs);
+ INIT_LIST_HEAD(&ctx->active_reqs);
+
+ if (aio_setup_ring(ctx) < 0)
+ goto out_freectx;
+
+ /* Allocate nr_reqs iocbs for io. Free iocbs are on the
+ * ctx->free_reqs list. When active they migrate to the
+ * active_reqs list. During completion and cancellation
+ * the request may temporarily not be on any list.
+ */
+ for (i=0; i<nr_reqs; i++) {
+ struct kiocb *iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+ if (!iocb)
+ goto out_freering;
+ memset(iocb, 0, sizeof(*iocb));
+ iocb->ki_key = i;
+ iocb->ki_users = 0;
+ list_add(&iocb->ki_list, &ctx->free_reqs);
+ }
+
+ /* now link into global list. kludge. FIXME */
+ atomic_add(ctx->max_reqs, &aio_nr); /* undone by __put_ioctx */
+ if (unlikely(atomic_read(&aio_nr) > aio_max_nr))
+ goto out_cleanup;
+ write_lock(&mm->ioctx_list_lock);
+ ctx->next = mm->ioctx_list;
+ mm->ioctx_list = ctx;
+ write_unlock(&mm->ioctx_list_lock);
+
+ dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, current->mm, ctx->ring_info.ring->nr);
+ return ctx;
+
+out_cleanup:
+ atomic_sub(ctx->max_reqs, &aio_nr); /* undone by __put_ioctx */
+ ctx->max_reqs = 0; /* prevent __put_ioctx from sub'ing aio_nr */
+ __put_ioctx(ctx);
+ return ERR_PTR(-EAGAIN);
+
+out_freering:
+ aio_free_ring(ctx);
+ ioctx_free_reqs(ctx);
+out_freectx:
+ kmem_cache_free(kioctx_cachep, ctx);
+ ctx = ERR_PTR(-ENOMEM);
+
+ dprintk("aio: error allocating ioctx %p\n", ctx);
+ return ctx;
+}
+
+/* aio_cancel_all
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
+ * the rapid destruction of the kioctx.
+ */
+static void aio_cancel_all(struct kioctx *ctx)
+{
+ int (*cancel)(struct kiocb *, struct io_event *);
+ struct io_event res;
+ spin_lock_irq(&ctx->ctx_lock);
+ ctx->dead = 1;
+ while (!list_empty(&ctx->active_reqs)) {
+ struct list_head *pos = ctx->active_reqs.next;
+ struct kiocb *iocb = list_kiocb(pos);
+ list_del_init(&iocb->ki_list);
+ cancel = iocb->ki_cancel;
+ if (cancel)
+ iocb->ki_users++;
+ spin_unlock_irq(&ctx->ctx_lock);
+ if (cancel)
+ cancel(iocb, &res);
+ spin_lock_irq(&ctx->ctx_lock);
+ }
+ spin_unlock_irq(&ctx->ctx_lock);
+}
+
+void wait_for_all_aios(struct kioctx *ctx)
+{
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ if (!ctx->reqs_active)
+ return;
+
+ add_wait_queue(&ctx->wait, &wait);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ while (ctx->reqs_active) {
+ printk("ctx->reqs_active = %d\n", ctx->reqs_active);
+ schedule();
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ }
+ set_task_state(tsk, TASK_RUNNING);
+ remove_wait_queue(&ctx->wait, &wait);
+}
+
+/* exit_aio: called when the last user of mm goes away. At this point,
+ * there is no way for any new requests to be submited or any of the
+ * io_* syscalls to be called on the context. However, there may be
+ * outstanding requests which hold references to the context; as they
+ * go away, they will call put_ioctx and release any pinned memory
+ * associated with the request (held via struct page * references).
+ */
+void exit_aio(struct mm_struct *mm)
+{
+ struct kioctx *ctx = mm->ioctx_list;
+ mm->ioctx_list = NULL;
+ while (ctx) {
+ struct kioctx *next = ctx->next;
+ ctx->next = NULL;
+ aio_cancel_all(ctx);
+
+ wait_for_all_aios(ctx);
+
+ if (1 != atomic_read(&ctx->users))
+ printk(KERN_DEBUG
+ "exit_aio:ioctx still alive: %d %d %d\n",
+ atomic_read(&ctx->users), ctx->dead,
+ ctx->reqs_active);
+ put_ioctx(ctx);
+ ctx = next;
+ }
+}
+
+/* __put_ioctx
+ * Called when the last user of an aio context has gone away,
+ * and the struct needs to be freed.
+ */
+void __put_ioctx(struct kioctx *ctx)
+{
+ unsigned nr_reqs = ctx->max_reqs;
+
+ if (unlikely(ctx->reqs_active))
+ BUG();
+
+ aio_free_ring(ctx);
+ mmdrop(ctx->mm);
+ ctx->mm = NULL;
+ pr_debug("__put_ioctx: freeing %p\n", ctx);
+ ioctx_free_reqs(ctx);
+ kmem_cache_free(kioctx_cachep, ctx);
+
+ atomic_sub(nr_reqs, &aio_nr);
+}
+
+/* aio_get_req
+ * Allocate a slot for an aio request. Increments the users count
+ * of the kioctx so that the kioctx stays around until all requests are
+ * complete. Returns -EAGAIN if no requests are free.
+ */
+static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
+static struct kiocb *__aio_get_req(struct kioctx *ctx)
+{
+ struct kiocb *req = NULL;
+ struct aio_ring *ring;
+
+ /* Check if the completion queue has enough free space to
+ * accept an event from this io.
+ */
+ spin_lock_irq(&ctx->ctx_lock);
+ ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
+ if (likely(!list_empty(&ctx->free_reqs) &&
+ (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) {
+ req = list_kiocb(ctx->free_reqs.next);
+ list_del(&req->ki_list);
+ list_add(&req->ki_list, &ctx->active_reqs);
+ ctx->reqs_active++;
+ req->ki_user_obj = NULL;
+ get_ioctx(ctx);
+
+ if (unlikely(req->ki_ctx != NULL))
+ BUG();
+ req->ki_ctx = ctx;
+ if (unlikely(req->ki_users))
+ BUG();
+ req->ki_users = 1;
+ }
+ kunmap_atomic(ring, KM_USER0);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ return req;
+}
+
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+{
+ struct kiocb *req;
+ /* Handle a potential starvation case -- should be exceedingly rare as
+ * requests will be stuck on fput_head only if the aio_fput_routine is
+ * delayed and the requests were the last user of the struct file.
+ */
+ req = __aio_get_req(ctx);
+ if (unlikely(NULL == req)) {
+ aio_fput_routine(NULL);
+ req = __aio_get_req(ctx);
+ }
+ return req;
+}
+
+static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+ req->ki_ctx = NULL;
+ req->ki_filp = NULL;
+ req->ki_user_obj = NULL;
+ ctx->reqs_active--;
+ list_add(&req->ki_list, &ctx->free_reqs);
+
+ if (unlikely(!ctx->reqs_active && ctx->dead))
+ wake_up(&ctx->wait);
+}
+
+static void aio_fput_routine(void *data)
+{
+ spin_lock_irq(&fput_lock);
+ while (likely(!list_empty(&fput_head))) {
+ struct kiocb *req = list_kiocb(fput_head.next);
+ struct kioctx *ctx = req->ki_ctx;
+
+ list_del(&req->ki_list);
+ spin_unlock_irq(&fput_lock);
+
+ /* Complete the fput */
+ __fput(req->ki_filp);
+
+ /* Link the iocb into the context's free list */
+ spin_lock_irq(&ctx->ctx_lock);
+ really_put_req(ctx, req);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ put_ioctx(ctx);
+ spin_lock_irq(&fput_lock);
+ }
+ spin_unlock_irq(&fput_lock);
+}
+
+/* __aio_put_req
+ * Returns true if this put was the last user of the request.
+ */
+static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+ dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
+ req, atomic_read(&req->ki_filp->f_count));
+
+ req->ki_users --;
+ if (unlikely(req->ki_users < 0))
+ BUG();
+ if (likely(req->ki_users))
+ return 0;
+ list_del(&req->ki_list); /* remove from active_reqs */
+ req->ki_cancel = NULL;
+
+ /* Must be done under the lock to serialise against cancellation.
+ * Call this aio_fput as it duplicates fput via the fput_tqueue.
+ */
+ if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+ get_ioctx(ctx);
+ spin_lock(&fput_lock);
+ list_add(&req->ki_list, &fput_head);
+ spin_unlock(&fput_lock);
+ schedule_task(&fput_tqueue);
+ } else
+ really_put_req(ctx, req);
+ return 1;
+}
+
+/* aio_put_req
+ * Returns true if this put was the last user of the kiocb,
+ * false if the request is still in use.
+ */
+int aio_put_req(struct kiocb *req)
+{
+ struct kioctx *ctx = req->ki_ctx;
+ int ret;
+ spin_lock_irq(&ctx->ctx_lock);
+ ret = __aio_put_req(ctx, req);
+ spin_unlock_irq(&ctx->ctx_lock);
+ if (ret)
+ put_ioctx(ctx);
+ return ret;
+}
+
+/* Lookup an ioctx id. Reads take the ioctx_list_lock.
+ * FIXME: this is O(n) and is only suitable for development.
+ */
+static inline struct kioctx *lookup_ioctx(unsigned long ctx_id)
+{
+ struct kioctx *ioctx;
+ struct mm_struct *mm;
+
+ mm = current->mm;
+ read_lock(&mm->ioctx_list_lock);
+ for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
+ if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
+ get_ioctx(ioctx);
+ break;
+ }
+ read_unlock(&mm->ioctx_list_lock);
+
+ return ioctx;
+}
+
+/* aio_complete
+ * Called when the io request on the given iocb is complete.
+ * Returns true if this is the last user of the request. The
+ * only other user of the request can be the cancellation code.
+ */
+int aio_complete(struct kiocb *iocb, long res, long res2)
+{
+ struct kioctx *ctx = iocb->ki_ctx;
+ struct aio_ring_info *info = &ctx->ring_info;
+ struct aio_ring *ring;
+ struct io_event *event;
+ unsigned long flags;
+ unsigned long tail;
+ int ret;
+
+ /* add a completion event to the ring buffer.
+ * must be done holding ctx->ctx_lock to prevent
+ * other code from messing with the tail
+ * pointer since we might be called from irq
+ * context.
+ */
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+ ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+ tail = info->tail;
+ event = aio_ring_event(info, tail, KM_IRQ0);
+ tail = (tail + 1) % info->nr;
+
+ event->obj = (u64)(unsigned long)iocb->ki_user_obj;
+ event->data = iocb->ki_user_data;
+ event->res = res;
+ event->res2 = res2;
+
+ dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, iocb, iocb->ki_user_obj, iocb->ki_user_data,
+ res, res2);
+
+ /* after flagging the request as done, we
+ * must never even look at it again
+ */
+ barrier();
+
+ info->tail = tail;
+ ring->tail = tail;
+
+ wmb();
+ put_aio_ring_event(event, KM_IRQ0);
+ kunmap_atomic(ring, KM_IRQ1);
+
+ pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+
+ /* everything turned out well, dispose of the aiocb. */
+ ret = __aio_put_req(ctx, iocb);
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+
+ if (ret)
+ put_ioctx(ctx);
+
+ return ret;
+}
+
+/* aio_read_evt
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched (0 or 1 ;-)
+ * FIXME: make this use cmpxchg.
+ * TODO: make the ringbuffer user mmap()able (requires FIXME).
+ */
+static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+{
+ struct aio_ring_info *info = &ioctx->ring_info;
+ struct aio_ring *ring;
+ unsigned long head;
+ int ret = 0;
+
+ ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ dprintk("in aio_read_evt h%lu t%lu m%lu\n",
+ (unsigned long)ring->head, (unsigned long)ring->tail,
+ (unsigned long)ring->nr);
+ barrier();
+ if (ring->head == ring->tail)
+ goto out;
+
+ spin_lock(&info->ring_lock);
+
+ head = ring->head % info->nr;
+ if (head != ring->tail) {
+ struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+ *ent = *evp;
+ head = (head + 1) % info->nr;
+ barrier();
+ ring->head = head;
+ ret = 1;
+ put_aio_ring_event(evp, KM_USER1);
+ }
+ spin_unlock(&info->ring_lock);
+
+out:
+ kunmap_atomic(ring, KM_USER0);
+ dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
+ (unsigned long)ring->head, (unsigned long)ring->tail);
+ return ret;
+}
+
+struct timeout {
+ struct timer_list timer;
+ int timed_out;
+ struct task_struct *p;
+};
+
+static void timeout_func(unsigned long data)
+{
+ struct timeout *to = (struct timeout *)data;
+
+ to->timed_out = 1;
+ wake_up_process(to->p);
+}
+
+static inline void init_timeout(struct timeout *to)
+{
+ init_timer(&to->timer);
+ to->timer.data = (unsigned long)to;
+ to->timer.function = timeout_func;
+ to->timed_out = 0;
+ to->p = current;
+}
+
+static inline void ts_subtract_now(struct timespec *ts)
+{
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ ts->tv_sec -= tv.tv_sec;
+ ts->tv_nsec -= tv.tv_usec * 1000;
+ if (ts->tv_nsec < 0) {
+ ts->tv_nsec += 1000000000;
+ ts->tv_sec --;
+ }
+}
+
+static inline void set_timeout(struct timeout *to, const struct timespec *ts)
+{
+ unsigned long how_long;
+
+ if (ts->tv_sec < 0 || (!ts->tv_sec && !ts->tv_nsec)) {
+ to->timed_out = 1;
+ return;
+ }
+
+ how_long = ts->tv_sec * HZ;
+#define HZ_NS (1000000000 / HZ)
+ how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS;
+
+ to->timer.expires = jiffies + how_long;
+ add_timer(&to->timer);
+}
+
+static inline void clear_timeout(struct timeout *to)
+{
+ del_timer_sync(&to->timer);
+}
+
+static int read_events(struct kioctx *ctx, int nr, struct io_event *event,
+ const struct timespec *timeout)
+{
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+ int ret;
+ int i = 0;
+ struct io_event ent;
+ struct timeout to;
+
+ /* needed to zero any padding within an entry (there shouldn't be
+ * any, but C is fun!)
+ */
+ memset(&ent, 0, sizeof(ent));
+ ret = 0;
+
+ while (likely(i < nr)) {
+ ret = aio_read_evt(ctx, &ent);
+ if (unlikely(ret <= 0))
+ break;
+
+ dprintk("read event: %Lx %Lx %Lx %Lx\n",
+ ent.data, ent.obj, ent.res, ent.res2);
+
+ /* Could we split the check in two? */
+ ret = -EFAULT;
+ if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+ dprintk("aio: lost an event due to EFAULT.\n");
+ break;
+ }
+ ret = 0;
+
+ /* Good, event copied to userland, update counts. */
+ event ++;
+ i ++;
+ }
+
+ if (i)
+ return i;
+ if (ret)
+ return ret;
+
+ /* End fast path */
+
+ /* initialise unconditionally so to.timed_out is never read
+ * uninitialised when no timeout was supplied */
+ init_timeout(&to);
+ if (timeout) {
+ struct timespec ts;
+ ret = -EFAULT;
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+ goto out;
+
+ ts_subtract_now(&ts);
+ set_timeout(&to, &ts);
+ if (to.timed_out)
+ timeout = 0;
+ }
+
+ while (likely(i < nr)) {
+ add_wait_queue_exclusive(&ctx->wait, &wait);
+ do {
+ set_task_state(tsk, TASK_INTERRUPTIBLE);
+
+ ret = aio_read_evt(ctx, &ent);
+ if (ret)
+ break;
+ if (i)
+ break;
+ ret = 0;
+ if (to.timed_out) /* Only check after read evt */
+ break;
+ schedule();
+ if (signal_pending(tsk)) {
+ ret = -EINTR;
+ break;
+ }
+ /*ret = aio_read_evt(ctx, &ent);*/
+ } while (1) ;
+
+ set_task_state(tsk, TASK_RUNNING);
+ remove_wait_queue(&ctx->wait, &wait);
+
+ if (unlikely(ret <= 0))
+ break;
+
+ ret = -EFAULT;
+ if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+ dprintk("aio: lost an event due to EFAULT.\n");
+ break;
+ }
+
+ /* Good, event copied to userland, update counts. */
+ event ++;
+ i ++;
+ }
+
+ if (timeout)
+ clear_timeout(&to);
+out:
+ return i ? i : ret;
+}
+
+/* Take an ioctx and remove it from the list of ioctx's. Protects
+ * against races with itself via ->dead.
+ */
+static void io_destroy(struct kioctx *ioctx)
+{
+ struct mm_struct *mm = current->mm;
+ struct kioctx **tmp;
+ int was_dead;
+
+ /* delete the entry from the list if someone else hasn't already */
+ write_lock(&mm->ioctx_list_lock);
+ was_dead = ioctx->dead;
+ ioctx->dead = 1;
+ for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
+ tmp = &(*tmp)->next)
+ ;
+ if (*tmp)
+ *tmp = ioctx->next;
+ write_unlock(&mm->ioctx_list_lock);
+
+ dprintk("aio_release(%p)\n", ioctx);
+ if (likely(!was_dead))
+ put_ioctx(ioctx); /* twice for the list */
+
+ aio_cancel_all(ioctx);
+ wait_for_all_aios(ioctx);
+ put_ioctx(ioctx); /* once for the lookup */
+}
+
+asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp)
+{
+ struct kioctx *ioctx = NULL;
+ unsigned long ctx;
+ long ret;
+
+ ret = get_user(ctx, ctxp);
+ if (unlikely(ret))
+ goto out;
+
+ ret = -EINVAL;
+ if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) {
+ pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n");
+ goto out;
+ }
+
+ ioctx = ioctx_alloc(nr_reqs);
+ ret = PTR_ERR(ioctx);
+ if (!IS_ERR(ioctx)) {
+ ret = put_user(ioctx->user_id, ctxp);
+ if (!ret)
+ return 0;
+ io_destroy(ioctx);
+ }
+
+out:
+ return ret;
+}
+
+/* aio_release
+ * Release the kioctx associated with the userspace handle.
+ */
+asmlinkage long sys_io_destroy(aio_context_t ctx)
+{
+ struct kioctx *ioctx = lookup_ioctx(ctx);
+ if (likely(NULL != ioctx)) {
+ io_destroy(ioctx);
+ return 0;
+ }
+ pr_debug("EINVAL: io_destroy: invalid context id\n");
+ return -EINVAL;
+}
+
+static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
+ struct iocb *iocb));
+static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
+ struct iocb *iocb)
+{
+ struct kiocb *req;
+ struct file *file;
+ ssize_t ret;
+ char *buf;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
+ iocb->aio_reserved3)) {
+ pr_debug("EINVAL: io_submit: reserve field set\n");
+ return -EINVAL;
+ }
+
+ /* prevent overflows */
+ if (unlikely(
+ (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+ (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+ ((ssize_t)iocb->aio_nbytes < 0)
+ )) {
+ pr_debug("EINVAL: io_submit: overflow check\n");
+ return -EINVAL;
+ }
+
+ file = fget(iocb->aio_fildes);
+ if (unlikely(!file))
+ return -EBADF;
+
+ req = aio_get_req(ctx);
+ if (unlikely(!req)) {
+ fput(file);
+ return -EAGAIN;
+ }
+
+ req->ki_filp = file;
+ iocb->aio_key = req->ki_key;
+ ret = put_user(iocb->aio_key, &user_iocb->aio_key);
+ if (unlikely(ret)) {
+ dprintk("EFAULT: aio_key\n");
+ goto out_put_req;
+ }
+
+ req->ki_user_obj = user_iocb;
+ req->ki_user_data = iocb->aio_data;
+
+ buf = (char *)(unsigned long)iocb->aio_buf;
+
+ switch (iocb->aio_lio_opcode) {
+ case IOCB_CMD_PREAD:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ ret = -EFAULT;
+ if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_read)
+ ret = file->f_op->aio_read(req, buf,
+ iocb->aio_nbytes, iocb->aio_offset);
+ break;
+ case IOCB_CMD_PWRITE:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ ret = -EFAULT;
+ if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_write)
+ ret = file->f_op->aio_write(req, buf,
+ iocb->aio_nbytes, iocb->aio_offset);
+ break;
+ case IOCB_CMD_FDSYNC:
+ ret = -EINVAL;
+ if (file->f_op->aio_fsync)
+ ret = file->f_op->aio_fsync(req, 1);
+ break;
+ case IOCB_CMD_FSYNC:
+ ret = -EINVAL;
+ if (file->f_op->aio_fsync)
+ ret = file->f_op->aio_fsync(req, 0);
+ break;
+ default:
+ dprintk("EINVAL: io_submit: no operation provided\n");
+ ret = -EINVAL;
+ }
+
+ if (likely(EIOCBQUEUED == ret))
+ return 0;
+ if (ret >= 0) {
+ aio_complete(req, ret, 0);
+ return 0;
+ }
+
+out_put_req:
+ aio_put_req(req);
+ return ret;
+}
+
+/* sys_io_submit
+ * Copy an aiocb from userspace into kernel space, then convert it to
+ * a kiocb, submit and repeat until done. Error codes on copy/submit
+ * only get returned for the first aiocb copied as otherwise the size
+ * of aiocbs copied is returned (standard write semantics).
+ */
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+ struct kioctx *ctx;
+ long ret = 0;
+ int i;
+
+ if (unlikely(nr < 0))
+ return -EINVAL;
+
+ if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
+ return -EFAULT;
+
+ ctx = lookup_ioctx(ctx_id);
+ if (unlikely(!ctx)) {
+ pr_debug("EINVAL: io_submit: invalid context id\n");
+ return -EINVAL;
+ }
+
+ for (i=0; i<nr; i++) {
+ struct iocb *user_iocb, tmp;
+
+ if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (unlikely(__copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = io_submit_one(ctx, user_iocb, &tmp);
+ if (ret)
+ break;
+ }
+
+ put_ioctx(ctx);
+ return i ? i : ret;
+}
+
+/* lookup_kiocb
+ * Finds a given iocb for cancellation.
+ * MUST be called with ctx->ctx_lock held.
+ */
+struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key)
+{
+ struct list_head *pos;
+ /* TODO: use a hash or array, this sucks. */
+ list_for_each(pos, &ctx->active_reqs) {
+ struct kiocb *kiocb = list_kiocb(pos);
+ if (kiocb->ki_user_obj == iocb && kiocb->ki_key == key)
+ return kiocb;
+ }
+ return NULL;
+}
+
+/* sys_io_cancel
+ * Cancels the io previously submitted via iocb. If successful,
+ * returns 0 and places the resulting event in res. Otherwise,
+ * return -somerror.
+ */
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb,
+ struct io_event *u_res)
+{
+ int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct kioctx *ctx;
+ struct kiocb *kiocb;
+ struct io_event result;
+ u32 key;
+ int ret;
+
+ ret = get_user(key, &iocb->aio_key);
+ if (unlikely(ret))
+ return -EFAULT;
+
+ ctx = lookup_ioctx(ctx_id);
+ if (unlikely(!ctx))
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->ctx_lock);
+ ret = -EAGAIN;
+ kiocb = lookup_kiocb(ctx, iocb, key);
+ if (kiocb && kiocb->ki_cancel) {
+ cancel = kiocb->ki_cancel;
+ kiocb->ki_users ++;
+ } else
+ cancel = NULL;
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ if (NULL != cancel) {
+ printk("calling cancel\n");
+ ret = cancel(kiocb, &result);
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(u_res, &result, sizeof(result)))
+ ret = -EFAULT;
+ }
+ } else
+ printk(KERN_DEBUG "iocb has no cancel operation\n");
+
+ put_ioctx(ctx);
+
+ return ret;
+}
+
+/* sys_io_getevents_abs:
+ * Reads at most nr completion events into the array pointed to by
+ * events from the io context ctx_id.
+ */
+asmlinkage long sys_io_getevents_abs(aio_context_t ctx_id,
+ long nr,
+ struct io_event *events,
+ const struct timespec *when)
+{
+ struct kioctx *ioctx = lookup_ioctx(ctx_id);
+ long ret = -EINVAL;
+
+ if (likely(NULL != ioctx)) {
+ ret = read_events(ioctx, nr, events, when);
+ put_ioctx(ioctx);
+ }
+
+ return ret;
+}
+
+/* vsys_io_getevents: runs in userspace to fetch what io events are
+ * available.
+ */
+#if 0
+__attribute__((section(".vsyscall_text")))
+asmlinkage long vsys_io_getevents(aio_context_t ctx_id,
+ long nr,
+ struct io_event *events,
+ const struct timespec *when)
+{
+ struct aio_ring *ring = (struct aio_ring *)ctx_id;
+ long i = 0;
+
+ while (i < nr) {
+ unsigned head;
+
+ head = ring->head;
+ if (head == ring->tail)
+ break;
+
+ *events++ = ring->io_events[head];
+ head = (head + 1) % ring->nr;
+ ring->head = head;
+ i++;
+ }
+
+ if (i)
+ return i;
+ return vsys_io_getevents_slow(ctx_id, nr, events, when);
+}
+#endif
+
+__initcall(aio_setup);
+
+EXPORT_SYMBOL(aio_complete);
+EXPORT_SYMBOL(aio_put_req);
diff -urN v2.5.29/include/asm-i386/kmap_types.h aio-v2.5.29.diff/include/asm-i386/kmap_types.h
--- v2.5.29/include/asm-i386/kmap_types.h Tue Jun 18 23:22:22 2002
+++ aio-v2.5.29.diff/include/asm-i386/kmap_types.h Tue Jul 30 10:38:30 2002
@@ -19,7 +19,9 @@
D(6) KM_BIO_DST_IRQ,
D(7) KM_PTE0,
D(8) KM_PTE1,
-D(9) KM_TYPE_NR
+D(9) KM_IRQ0,
+D(10) KM_IRQ1,
+D(11) KM_TYPE_NR
};
#undef D
diff -urN v2.5.29/include/asm-i386/unistd.h aio-v2.5.29.diff/include/asm-i386/unistd.h
--- v2.5.29/include/asm-i386/unistd.h Thu Jun 6 00:35:32 2002
+++ aio-v2.5.29.diff/include/asm-i386/unistd.h Tue Jul 30 17:22:47 2002
@@ -247,6 +247,12 @@
#define __NR_futex 240
#define __NR_sched_setaffinity 241
#define __NR_sched_getaffinity 242
+#define __NR_set_thread_area 243
+#define __NR_io_setup 244
+#define __NR_io_destroy 245
+#define __NR_io_getevents_abs 246
+#define __NR_io_submit 247
+#define __NR_io_cancel 248
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -urN v2.5.29/include/linux/aio.h aio-v2.5.29.diff/include/linux/aio.h
--- v2.5.29/include/linux/aio.h Wed Dec 31 19:00:00 1969
+++ aio-v2.5.29.diff/include/linux/aio.h Tue Jul 30 17:40:17 2002
@@ -0,0 +1,118 @@
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <linux/tqueue.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+#include <linux/aio_abi.h>
+
+#define AIO_MAXSEGS 4
+#define AIO_KIOGRP_NR_ATOMIC 8
+
+struct kioctx;
+
+/* Notes on cancelling a kiocb:
+ * If a kiocb is cancelled, aio_complete may return 0 to indicate
+ * that cancel has not yet disposed of the kiocb. All cancel
+ * operations *must* call aio_put_req to dispose of the kiocb
+ * to guard against races with the completion code.
+ */
+#define KIOCB_C_CANCELLED 0x01
+#define KIOCB_C_COMPLETE 0x02
+
+struct kiocb {
+ struct list_head ki_list;
+
+ struct file *ki_filp;
+ void *ki_data; /* for use by the file */
+
+ struct kioctx *ki_ctx;
+ int ki_users;
+
+ void *ki_user_obj;
+ __u64 ki_user_data;
+
+ unsigned ki_key; /* id of this request */
+ int (*ki_cancel)(struct kiocb *, struct io_event *);
+};
+
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head;
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr)
+
+#define AIO_RING_PAGES 8
+struct aio_ring_info {
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ spinlock_t ring_lock;
+ long nr_pages;
+
+ unsigned nr, tail;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+};
+
+struct kioctx {
+ atomic_t users;
+ int dead;
+ struct mm_struct *mm;
+
+ /* This needs improving */
+ unsigned long user_id;
+ struct kioctx *next;
+
+ wait_queue_head_t wait;
+
+ spinlock_t ctx_lock;
+
+ int reqs_active;
+ struct list_head free_reqs;
+ struct list_head active_reqs; /* used for cancellation */
+
+ unsigned max_reqs;
+
+ struct aio_ring_info ring_info;
+};
+
+/* prototypes */
+extern unsigned aio_max_size;
+
+extern int FASTCALL(aio_put_req(struct kiocb *iocb));
+extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2));
+extern void FASTCALL(__put_ioctx(struct kioctx *ctx));
+struct mm_struct;
+extern void FASTCALL(exit_aio(struct mm_struct *mm));
+
+#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
+#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)
+
+#include <linux/aio_abi.h>
+
+static inline struct kiocb *list_kiocb(struct list_head *h)
+{
+ return list_entry(h, struct kiocb, ki_list);
+}
+
+/* for sysctl: */
+extern unsigned aio_max_nr, aio_max_size, aio_max_pinned;
+
+#endif /* __LINUX__AIO_H */
diff -urN v2.5.29/include/linux/aio_abi.h aio-v2.5.29.diff/include/linux/aio_abi.h
--- v2.5.29/include/linux/aio_abi.h Wed Dec 31 19:00:00 1969
+++ aio-v2.5.29.diff/include/linux/aio_abi.h Tue Jul 30 17:37:40 2002
@@ -0,0 +1,89 @@
+/* linux/aio_abi.h
+ *
+ * Copyright 2000,2001,2002 Red Hat.
+ *
+ * Written by Benjamin LaHaise <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation is hereby granted, provided that the above copyright
+ * notice appears in all copies. This software is provided without any
+ * warranty, express or implied. Red Hat makes no representations about
+ * the suitability of this software for any purpose.
+ *
+ * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+ * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND
+ * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ */
+#ifndef __LINUX__AIO_ABI_H
+#define __LINUX__AIO_ABI_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long aio_context_t;
+
+enum {
+ IOCB_CMD_PREAD = 0,
+ IOCB_CMD_PWRITE = 1,
+ IOCB_CMD_FSYNC = 2,
+ IOCB_CMD_FDSYNC = 3,
+ /* These two are experimental.
+ * IOCB_CMD_PREADX = 4,
+ * IOCB_CMD_POLL = 5,
+ */
+ IOCB_CMD_NOOP = 6,
+};
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+ __u64 data; /* the data field from the iocb */
+ __u64 obj; /* what iocb this event came from */
+ __s64 res; /* result code for this event */
+ __s64 res2; /* secondary result */
+};
+
+#if defined(__LITTLE_ENDIAN)
+#define PADDED(x,y) x, y
+#elif defined(__BIG_ENDIAN)
+#define PADDED(x,y) y, x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland. its up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+ /* these are internal to the kernel/libc. */
+ __u64 aio_data; /* data to be returned in event's data */
+ __u32 PADDED(aio_key, aio_reserved1);
+ /* the kernel sets aio_key to the req # */
+
+ /* common fields */
+ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */
+ __s16 aio_reqprio;
+ __u32 aio_fildes;
+
+ __u64 aio_buf;
+ __u64 aio_nbytes;
+ __s64 aio_offset;
+
+ /* extra parameters */
+ __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
+ __u64 aio_reserved3;
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#endif /* __LINUX__AIO_ABI_H */
+
diff -urN v2.5.29/include/linux/errno.h aio-v2.5.29.diff/include/linux/errno.h
--- v2.5.29/include/linux/errno.h Fri Feb 9 17:46:13 2001
+++ aio-v2.5.29.diff/include/linux/errno.h Tue Jul 30 14:36:09 2002
@@ -10,6 +10,7 @@
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
+#define EIOCBQUEUED 516 /* Async operation is queued. */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
diff -urN v2.5.29/include/linux/fs.h aio-v2.5.29.diff/include/linux/fs.h
--- v2.5.29/include/linux/fs.h Tue Jul 30 10:24:33 2002
+++ aio-v2.5.29.diff/include/linux/fs.h Tue Jul 30 10:32:38 2002
@@ -740,6 +740,7 @@
* read, write, poll, fsync, readv, writev can be called
* without the big kernel lock held in all filesystems.
*/
+struct kiocb;
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
@@ -759,6 +760,12 @@
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+
+ ssize_t (*aio_read)(struct kiocb *, char *, size_t, loff_t);
+ ssize_t (*aio_write)(struct kiocb *, const char *, size_t, loff_t);
+ int (*aio_fsync)(struct kiocb *, int datasync);
+
+ struct kmem_cache_s *kiocb_slab;
};
struct inode_operations {
diff -urN v2.5.29/include/linux/sched.h aio-v2.5.29.diff/include/linux/sched.h
--- v2.5.29/include/linux/sched.h Tue Jul 30 10:24:33 2002
+++ aio-v2.5.29.diff/include/linux/sched.h Tue Jul 30 10:32:38 2002
@@ -166,6 +166,7 @@
/* Maximum number of active map areas.. This is a random (large) number */
#define MAX_MAP_COUNT (65536)
+struct kioctx;
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
rb_root_t mm_rb;
@@ -194,6 +195,11 @@
/* Architecture-specific MM context */
mm_context_t context;
+
+ /* aio bits that have to be shared between threads */
+ rwlock_t ioctx_list_lock;
+ struct kioctx *ioctx_list;
+ unsigned long new_ioctx_id;
};
extern int mmlist_nr;
diff -urN v2.5.29/kernel/fork.c aio-v2.5.29.diff/kernel/fork.c
--- v2.5.29/kernel/fork.c Tue Jul 30 10:24:19 2002
+++ aio-v2.5.29.diff/kernel/fork.c Tue Jul 30 12:13:00 2002
@@ -25,6 +25,7 @@
#include <linux/binfmts.h>
#include <linux/fs.h>
#include <linux/security.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -265,6 +266,7 @@
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
mm->page_table_lock = SPIN_LOCK_UNLOCKED;
+ rwlock_init(&mm->ioctx_list_lock);
mm->pgd = pgd_alloc(mm);
if (mm->pgd)
return mm;
@@ -310,6 +312,12 @@
list_del(&mm->mmlist);
mmlist_nr--;
spin_unlock(&mmlist_lock);
+ exit_aio(mm); /* This is partially wrong: it should be called
+ * when the last thread in a group exits to
+ * block exit until all IOs are cancelled.
+ * Here, we could block some random /proc user
+ * instead. Yuck. FIXME.
+ */
exit_mmap(mm);
mmdrop(mm);
}
On Tue, Jul 30, 2002 at 05:54:21PM -0400, Benjamin LaHaise wrote:
> On Tue, Jul 30, 2002 at 11:41:16PM +0200, Andrea Arcangeli wrote:
> > Can you point me out to a patch with the new cancellation API that you
> > agree with for merging in 2.5 so I can synchronize? I'm reading your
> > very latest patch loaded on some site in June. that will be really
> > helpful, many thanks!
>
> Here is what I've got for the aio core that has the cancellation
> change to return the completion event. The other slight change that
> I meant to get in before going into the mainstream is to have the
> timeout io_getevents takes be an absolute timeout, which helps for
> applications that have specific deadlines they are attempting to
> schedule to (think video playback). This drop is untested, but I'd
are you sure this is a good idea? this adds an implicit gettimeofday
(though with no kernel entry/exit) to every getevents syscall with a
"when" specified, so the user may now need to do a gettimeofday both
externally and internally to use the previous "timeout" feature (given
the kernel can only delay by a timeout, it now has to calculate the
timeout internally). I guess I prefer the previous version that had
the "timeout" information instead of "when". Also many soft-realtime
multimedia apps only expect the call to take a relative "timeout": if
a frame skips they'll just slow down the frame rate, so they won't be
real time but you'll still see something on the screen/audio. Otherwise
they can keep timing out endlessly if they cannot keep up with the
stream, and they will show nothing rather than showing a low frame rate.
So I'm not very excited about this change, I would prefer the previous
version. Also consider that with the vsyscall, doing the gettimeofday
calculation in userspace based on "when" rather than in-kernel isn't
going to be more expensive than your new API, even for applications
that really want the "when" behaviour instead of the "timeout". And
the applications that want the "timeout" will this way be forced into
a vgettimeofday in userspace plus another one in kernel, which is pure
overhead for them.
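Just to make the two usage patterns concrete, here is a rough userspace
sketch (hypothetical helper, nothing from the patch) of the extra clock
read the application does in the "timeout" case, which the kernel then
has to repeat internally in the "when" case:

/* sketch only: convert an application deadline into the relative
 * timeout the previous io_getevents() interface wanted */
#include <sys/time.h>
#include <time.h>

static struct timespec relative_from_deadline(const struct timespec *deadline)
{
	struct timeval now;
	struct timespec rel;

	gettimeofday(&now, NULL);		/* app-side clock read */
	rel.tv_sec  = deadline->tv_sec - now.tv_sec;
	rel.tv_nsec = deadline->tv_nsec - now.tv_usec * 1000;
	if (rel.tv_nsec < 0) {
		rel.tv_nsec += 1000000000;
		rel.tv_sec--;
	}
	return rel;
}

With "when", the kernel ends up doing the equivalent of this conversion
itself (see ts_subtract_now() in the patch), while the application
usually still reads the clock anyway to decide whether it is keeping up.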
So unless anybody can see a flaw in my reasoning, I would suggest you
back out the "when" change and resend to Linus.
Also, the vsyscall sections would be better deleted rather than left
under #if 0.
Everything else looks great, thanks!
Andrea
On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
> > On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
> > What would you suggest as an alternative API? The main point of multiplexing
> > is that ios can be submitted in batches, which can't be done if the ios are
> > submitted via individual syscalls, not to mention the overlap with the posix
> > aio api.
>
> yes, sys_io_sumbit has the advantage you can mix read/write/fsync etc..
> in the same array of iocb. But by the same argument we could as well
> have a submit_io instead of sys_read/sys_write/sys_fsync.
You can't batch synchronous requests, so your "by the same
argument" doesn't work.
Asynchronous requests, OTOH, could be submitted in large
bundles since the app doesn't wait on each request.
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Tue, Jul 30, 2002 at 10:20:51PM -0300, Rik van Riel wrote:
> On Tue, 30 Jul 2002, Andrea Arcangeli wrote:
> > On Tue, Jul 30, 2002 at 08:49:39AM -0400, Benjamin LaHaise wrote:
> > > On Tue, Jul 30, 2002 at 07:41:11AM +0200, Andrea Arcangeli wrote:
>
> > > What would you suggest as an alternative API? The main point of multiplexing
> > > is that ios can be submitted in batches, which can't be done if the ios are
> > > submitted via individual syscalls, not to mention the overlap with the posix
> > > aio api.
> >
> > yes, sys_io_sumbit has the advantage you can mix read/write/fsync etc..
> > in the same array of iocb. But by the same argument we could as well
> > have a submit_io instead of sys_read/sys_write/sys_fsync.
>
> You can't batch synchronous requests, so your "by the same
> argument" doesn't work.
>
> Asynchronous requests, OTOH, could be submitted in large
> bundles since the app doesn't wait on each request.
disagree, merging synchronous requests would make much more sense than
merging asynchronous requests in the same syscall: it would make them
asynchronous with respect to each other without losing their global
synchronous behaviour w.r.t. userspace.
With async-io it doesn't matter much how many requests you merge
(except to avoid entering/exiting the kernel, which applies to
synchronous operations too).
Andrea
On Wed, Jul 31, 2002 at 03:32:38AM +0200, Andrea Arcangeli wrote:
> disagree, merging synchronous requests would make much more sense than
> merging asynchronous requests in the same syscall, it would make them
> asynchronous with respect than each other without losing their global
> synchronous behaviour w.r.t. userspace.
readv/writev..
On Wed, Jul 31, 2002 at 09:25:27AM +0100, Christoph Hellwig wrote:
> On Wed, Jul 31, 2002 at 03:32:38AM +0200, Andrea Arcangeli wrote:
> > disagree, merging synchronous requests would make much more sense than
> > merging asynchronous requests in the same syscall, it would make them
> > asynchronous with respect than each other without losing their global
> > synchronous behaviour w.r.t. userspace.
>
> readv/writev..
exactly, that's the same concept even if it cannot intermix reads,
writes, fsyncs and polls in the same call :).
Andrea
On Wed, Jul 31, 2002 at 02:44:51AM +0200, Andrea Arcangeli wrote:
> So I'm not very excited about this change, I would prefer the previous
> version. Also consider with the vsyscall doing the gettimeofday
> calculation in userspace based on "when" rather than in-kernel isn't
> going to be more expensive than your new API even of applications that
> really want the "when" behaviour instead of the "timeout". While the
> applications that wants the "timeout" this way we'll be forced to a
> vgettimeofday in userspace and one in kernel which is a pure overhead
> for them.
That's still racy. There are several hundred instructions from the
time the timeout is calculated until the kernel actually uses the
timeout to calculate an offset relative to jiffies, during which a
task switch may occur. I suppose that this could be handled via a
separate timer interface (we should probably implement posix timers
anyways). I can see the arguments, and I guess it's easier to just
revert it.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
Andrea Arcangeli <[email protected]> writes:
> are you sure this is a good idea? this adds an implicit gettimeofday
> (thought no entry/exit kernel) to every getevents syscall with a
> "when" specificed, so the user may now need to do gettimeofday both
> externally and internally to use the previous "timeout" feature (given
> the kernel can delay only of a timeout, so the kernel has to calculate
> the timeout internally now). I guess I prefer the previous version that
> had the "timeout" information instead of "when". Also many soft
> multimedia only expect the timeout to take "timeout", and if a frame
> skips they'll just slowdown the frame rate, so they won't be real time
> but you'll see something on the screen/audio. Otherwise they can keep
> timing out endlessy if they cannot keep up with the stream, and they
> will show nothing rather than showing a low frame rate.
I disagree. If for some reason the multimedia player can not keep up,
there will be corresponding changes to subsequent requested timeouts.
For example, the pattern of future timeouts will reflect the new lower
frame rate (e.g. timeout after 1/15 s instead of 1/30 s). (BTW: I've
written adaptive media players, so I'm speaking from experience).
How repulsive would it be to add a boolean parameter that indicates
whether the supplied timeout value is relative or absolute?
-- Buck
Hi!
> > > Can you point me out to a patch with the new cancellation API that you
> > > agree with for merging in 2.5 so I can synchronize? I'm reading your
> > > very latest patch loaded on some site in June. that will be really
> > > helpful, many thanks!
> >
> > Here is what I've got for the aio core that has the cancellation
> > change to return the completion event. The other slight change that
> > I meant to get in before going into the mainstream is to have the
> > timeout io_getevents takes be an absolute timeout, which helps for
> > applications that have specific deadlines they are attempting to
> > schedule to (think video playback). This drop is untested, but I'd
>
> are you sure this is a good idea? this adds an implicit gettimeofday
> (thought no entry/exit kernel) to every getevents syscall with a
> "when" specificed, so the user may now need to do gettimeofday both
> externally and internally to use the previous "timeout" feature (given
> the kernel can delay only of a timeout, so the kernel has to calculate
> the timeout internally now). I guess I prefer the previous version that
> had the "timeout" information instead of "when". Also many soft
> multimedia only expect the timeout to take "timeout", and if a frame
> skips they'll just slowdown the frame rate, so they won't be real time
> but you'll see something on the screen/audio. Otherwise they can keep
> timing out endlessy if they cannot keep up with the stream, and they
> will show nothing rather than showing a low frame rate.
>
> So I'm not very excited about this change, I would prefer the previous
> version. Also consider with the vsyscall doing the gettimeofday
> calculation in userspace based on "when" rather than in-kernel isn't
> going to be more expensive than your new API even of applications that
> really want the "when" behaviour instead of the "timeout". While the
> applications that wants the "timeout" this way we'll be forced to a
> vgettimeofday in userspace and one in kernel which is a pure overhead
> for them.
I believe Linus actually explained why "when" looks way better to him
than "timeout". [It does not skew, for example.]
Pavel
--
I'm [email protected]. "In my country we have almost anarchy and I don't care."
Panos Katsaloulis describing me w.r.t. patents at [email protected]
On Thu, Aug 01, 2002 at 12:30:11PM +0200, Pavel Machek wrote:
> I believe Linus actually explained why "when" looks way better to him
> than "timeout". [It does not skew, for example.]
After thinking about it further, there is one problem with when that is
avoided with timeout: if the system time is changed between the timeout
calculation and the time the kernel calculates the jiffies offset, the
process could be delayed much longer than desired (and fixing this case
is hard enough that it should be avoided in typical code). Tradeoffs...
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
Benjamin LaHaise wrote:
> After thinking about it further, there is one problem with when that is
> avoided with timeout: if the system time is changed between the timeout
> calculation and the time the kernel calculates the jiffies offset, the
> process could be delayed much longer than desired (and fixing this case
> is hard enough that it should be avoided in typical code). Tradeoffs...
Now if we had a constant monotonic source of time--say 64-bit nanoseconds since
boot--this wouldn't be a problem.
Chris
--
Chris Friesen | MailStop: 043/33/F10
Nortel Networks | work: (613) 765-0557
3500 Carling Avenue | fax: (613) 765-2986
Nepean, ON K2H 8E9 Canada | email: [email protected]
On Thu, 1 Aug 2002, Chris Friesen wrote:
>
> Now if we had a constant monotonic source of time--say 64-bit nanoseconds since
> boot--this wouldn't be a problem.
Well, we do have such a monotonic time sequence already, and that's the
one that the kernel always uses internally.
It's called "jiffies64".
However, "jiffies" are not really real time, they are only a "reasonable
abstraction thereof", and while they imply ordering ("time_after()" works
fine inside the kernel), they do _not_ imply real time.
In other words, there is no way to move from time -> jiffies and back.
But we could certainly export jiffies64 as a "nanosecond-like" thing. All
it takes is one 32x64-bit multiply. It won't be "true nanoseconds", but it
will be a "reasonable approximation" (ie the rate may be off by several
percentage points, since nothing is correcting for it. But the "no
correction" is part of the _advantage_ too).
Linus
On Thu, 2002-08-01 at 17:09, Linus Torvalds wrote:
> However, "jiffies" are not really real time, they are only a "reasonable
> abstraction thereof", and while they imply ordering ("time_after()" works
> fine inside the kernel), they do _not_ imply real time.
>
> In other words, there is no way to move from time -> jiffies and back.
For a lot of applications like multimedia you actually want a counting
of time, not any relation to real time, except that you can tell how
many ticks elapse per second.
On 1 Aug 2002, Alan Cox wrote:
>
> For a lot of applications like multimedia you actually want a counting
> of time not any relation to real time except that you can tell how many
> ticks elapse a second.
Absolutely. I think "jiffies64" is fine (as long as is it converted to
some "standard" time-measure like microseconds or nanoseconds so that
people don't have to care about internal kernel state) per se.
The only thing that I think makes it less than wonderful is really the
fact that we cannot give an accurate measure for it. We can _say_ that
we count in microseconds, but it might turn out that instead of the
perfect 1000000 ticks a second there would really be 983671 ticks.
A 2% error may not be a big problem for most people, of course. But it
might be a huge problem for others. Those people would have to do their
own re-calibration..
Linus
On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote:
> Absolutely. I think "jiffies64" is fine (as long as is it converted to
> some "standard" time-measure like microseconds or nanoseconds so that
> people don't have to care about internal kernel state) per se.
Hmmm, it almost sounds like implementing clock_gettime as a syscall and
exporting jiffies as CLOCK_MONOTONIC is the way to go, as that gives a
nanosecond resolution export of jiffies. Then, it would make sense to
use that as the basis for "when" timeouts. Relative timeouts still have
a certain simplicity to them that is appealing, though.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote:
> A 2% error may not be a big problem for most people, of course. But it
> might be a huge problem for others. Those people would have to do their
> own re-calibration..
How about export the value via a syscall and also export an 'error'
which for now could just be set to 5% or something conservative and
refined later if necessary or cleanup on other architectures,
something like:
/* export a monotonically increasing, approximately-nanosecond counter
to user-space. kt_ns is a relative value, it does NOT necessarily
equal nanoseconds since boot. kt_err should be greater than
1 stdev of the error in kt_ns */
struct kern_time {
	__u64 kt_ns;
	__u64 kt_err;
};
--cw
On Thu, Aug 01, 2002 at 12:25:06PM -0700, Linus Torvalds wrote:
> I seriously doubt that people really care _that_ much about a
> precise time source for aio timeouts, and we should spend more
> time on making it efficient and easy to use than on worrying about
> the precision. People who do care can fall back to gettimeofday()
> and try to correct for it that way.
In that case define the time to be approximate and nothing more.
The reason for the original suggestion was that it seemed feasible the
syscall could in the future be used for other purposes (multimedia
synchronisation) *and* be of value if made more precise, without adding
yet another syscall at a later stage to do just this.
--cw
On Thu, 1 Aug 2002, Chris Wedgwood wrote:
>
> How about export the value via a syscall and also export an 'error'
> which for now could just be set to 5% or something conservative and
> refined later if necessary or cleanup on other architectures,
Ugh. That sounds like overdesign, and I hate overdesign.
The error is also rather hard to quantify, and only user land can do that
sanely anyway in the long run (ie the same reason why we have things like
/etc/adjtime - good guesses depend on history).
The kernel really shouldn't be involved in something like this.
I seriously doubt that people really care _that_ much about a precise
time source for aio timeouts, and we should spend more time on making it
efficient and easy to use than on worrying about the precision. People who
do care can fall back to gettimeofday() and try to correct for it that
way.
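For what it's worth, a sketch of that userland correction, with
approx_ns() standing in for whatever counter would get exported:

/* measure the ratio between the approximate tick counter and
 * gettimeofday() over an interval, then rescale readings with it */
#include <sys/time.h>
#include <unistd.h>

extern unsigned long long approx_ns(void);	/* hypothetical export */

double tick_clock_ratio(void)
{
	struct timeval tv0, tv1;
	unsigned long long n0, n1;

	gettimeofday(&tv0, NULL); n0 = approx_ns();
	sleep(10);				/* calibration interval */
	gettimeofday(&tv1, NULL); n1 = approx_ns();

	return ((tv1.tv_sec - tv0.tv_sec) * 1e9 +
		(tv1.tv_usec - tv0.tv_usec) * 1e3) / (double)(n1 - n0);
}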
Linus
Hi!
> > For a lot of applications like multimedia you actually want a counting
> > of time not any relation to real time except that you can tell how many
> > ticks elapse a second.
>
> Absolutely. I think "jiffies64" is fine (as long as is it converted to
> some "standard" time-measure like microseconds or nanoseconds so that
> people don't have to care about internal kernel state) per se.
>
> The only thing that I think makes it less than wonderful is really the
> fact that we cannot give an accurate measure for it. We can _say_ that
> what we count in microseconds, but it might turn out that instead of the
> perfect 1000000 ticks a second ther would really be 983671 ticks.
>
> A 2% error may not be a big problem for most people, of course. But it
> might be a huge problem for others. Those people would have to do their
> own re-calibration..
I don't think so.
Imagine DVD playback. With a 2% error your audio drifts by more than a
second every minute, and by over a minute by the end of an hour. 2% is
probably not acceptable.
[I'm not sure exactly how video/audio synchronization works, beyond the
fact that it often doesn't; but 2% could be a huge problem for something
like that.]
Pavel
--
Casualities in World Trade Center: ~3k dead inside the building,
cryptography in U.S.A. and free speech in Czech Republic.
On Fri, 2002-08-02 at 09:24, Pavel Machek wrote:
> Imagine DVD playback. If you have 2% error, your audio is going to get
> 1 second off each minute. It is going to be off by one minute at the
> end of hour. 2% is probably not acceptable.
Nobody does DVD synchronization off a timer. You synchronize the video
to the audio, because if the audio clock is a bit off it doesn't matter,
while if you lock the audio to the video you get nasty clicks and skips.
2% is way too much for a lot of applications. That's 28 minutes a day.
On 2 Aug 2002, Alan Cox wrote:
>
> 2% is way too much for a lot of applications. Thats 28 minutes a day
Note that _most_ PC clocks are a hell of a lot better than 2% a day, so
that was really meant as the worst case for fairly broken hardware. But it
apparently does happen.
A more realistic scenario is less than 0.1%, but with the caveat that if
the machine goes to sleep, the error goes up to infinity..
(Think of the current "jiffies" update and gettimeofday() _without_ any
ntp or /etc/adjtime. For most people it is good enough to use as a wall
clock. But some people literally lose or gain a minute every hour.
That's the kind of drift I'm talking about).
Linus
On Thu, Aug 01, 2002 at 02:01:12PM -0400, Benjamin LaHaise wrote:
> On Thu, Aug 01, 2002 at 09:30:04AM -0700, Linus Torvalds wrote:
> > Absolutely. I think "jiffies64" is fine (as long as is it converted to
> > some "standard" time-measure like microseconds or nanoseconds so that
> > people don't have to care about internal kernel state) per se.
>
> Hmmm, it almost sounds like implementing clock_gettime as a syscall and
> exporting jiffies as CLOCK_MONOTONIC is the way to go, as that gives a
> nanosecond resolution export of jiffies. Then, it would make sense to
> use that as the basis for "when" timeouts. Relative timeouts still have
> a certain simplicity to them that is appealing, though.
this is all about reducing the latency window between the read of
gettimeofday in userspace and the "add_timer" executed by the aio
syscall. But there will always be a window, because every timer is
programmed as a "timeout", not as an absolute time. And most
importantly, as said in my previous email (regardless of exporting
jiffies64 or some similar non-second unit to userspace), you will
always need some gettimeofday (or clock_gettime) in userspace too, to
detect whether your program can keep up with the load or is constantly
running out of time. And as soon as you run gettimeofday in userspace,
you run into a similar window to the one you're trying to shrink (i.e.
scheduling). Of course making the window a kernel window inside a
preempt_disable and a __cli() will help in a number of cases, but I
don't think it matters significantly, because you still need some
gettimeofday in userspace (or clock_gettime for that matter;
clock_gettime in fact is even worse than gettimeofday due to its
certainly lower resolution).
Last but not least, the SuS specification only contemplates a timeout.
So we couldn't take advantage of the "absolute time" (either in second
or jiffies64 units) from userspace unless you use a non-standard user
(not kernel) API.
Now, reading the SuS specification I also like our current io_submit
kernel API less and less: SuS does exactly what I suggested originally,
that is, aio_read/aio_write/aio_fsync as separate calls. So the merging
effect mentioned by Ben cannot be taken advantage of by the kernel
anyway, because userspace will issue separate calls for each command.
See the SuS API:
http://www.opengroup.org/onlinepubs/007908799/xsh/aio.h.html
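For instance, with the current ABI a glibc aio_read(3) can only end up
doing something along these lines anyway (rough sketch, my_ioctx and the
io_submit() stub are assumed, error handling omitted):

/* translate a POSIX aiocb into a kernel iocb and submit it */
#include <aio.h>
#include <string.h>
#include <linux/aio_abi.h>

extern aio_context_t my_ioctx;		/* assumed set up via io_setup() */
extern long io_submit(aio_context_t, long, struct iocb **);

int aio_read_via_io_submit(struct aiocb *cb)
{
	struct iocb kiocb, *kiocbp = &kiocb;

	memset(&kiocb, 0, sizeof(kiocb));
	kiocb.aio_data       = (unsigned long)cb;  /* find the aiocb back from the event */
	kiocb.aio_lio_opcode = IOCB_CMD_PREAD;
	kiocb.aio_fildes     = cb->aio_fildes;
	kiocb.aio_buf        = (unsigned long)cb->aio_buf;
	kiocb.aio_nbytes     = cb->aio_nbytes;
	kiocb.aio_offset     = cb->aio_offset;

	return io_submit(my_ioctx, 1, &kiocbp);	/* still one syscall per request */
}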
Can you please explain why you did a completely different kernel API
and force glibc to wrap the whole thing internally on top of your
different kernel API? Wouldn't it be much simpler to use the clean SuS
API for the kernel too, like we do for everything else? Do you expect
programmers to completely ignore the aio.h in glibc? I really would
like the aio.h in glibc to be the default used by programmers so we
follow a standard, and glibc should then rely on the kernel to execute
those functions efficiently (instead of using threads). Or is something
fundamental missing (i.e. not doable) in the SuS API, so that we're
forced to take a completely different route to provide the features we
need? I've only had a short look at the SuS aio API so far and it seems
to provide the same functionality, at least in the most important paths.
In short the SuS API looks good to me and I don't see why we aren't
implementing it straight with kernel support (doing it in the kernel
will also be faster, because even if we still implement the main I/O
entry point as an io_submit() function, wrappers in kernel code will
run faster, and having an aio_abi.h that basically matches the glibc
aio.h looks much cleaner than having the current aio_abi.h intermediate
thing that glibc needs to wrap around).
Raising this question isn't in my interest, because if somebody agrees
with me it will further delay the registration of an API, and that's a
problem, but I feel raising it anyway is the right thing to do, just to
be sure this issue isn't ignored (I'll be very happy to hear that my
suggestion to implement the SuS API in the kernel is flawed for some
reason :). Otherwise, if somebody agrees with me, I'll do my best to
avoid adding delays even if we choose to rewrite the API completely.
BTW, regardless of whether we make the kernel API match the user API
(which I think would be the natural thing to do rather than having this
special kernel API), I really would like to have "a" kernel API
registered in 2.5 ASAP, otherwise I'll be forced to start using the
dynamic syscall too, and I'd much prefer to spend my time doing useful
things instead. If no patch starts floating around for submission in
2.5 I will send new patches myself again.
Comments are welcome (I will probably be offline until 20 August, so
replies aren't very urgent), thanks!
Andrea
On Fri, Aug 16, 2002 at 01:54:59AM +0200, Andrea Arcangeli wrote:
> the window a kernel window inside a preempt_disable and a __cli() will
> have a goodness effect in a number of cases, but I don't think it
> matters significantly because you still need some gettimeofday in
> userspace (or clock_gettime if that matters, clock_gettime infact is
> even worse than gettimeofday due its certainly lower resolution).
Yeah, I've come full circle back to the relative timeout point of view.
By grabbing a copy of jiffies at the beginning of the function the race
with preempt can be avoided.
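Something like this, as a sketch against the patch's struct timeout
(untested):

/* sample jiffies once at syscall entry and base the expiry on that
 * sample, so being preempted between the timespec conversion and
 * add_timer() can no longer stretch the effective timeout */
static void set_timeout_from(struct timeout *to, unsigned long entry_jiffies,
			     const struct timespec *ts)
{
	unsigned long how_long;

	if (ts->tv_sec < 0 || (!ts->tv_sec && !ts->tv_nsec)) {
		to->timed_out = 1;
		return;
	}

	how_long  = ts->tv_sec * HZ;
	how_long += (ts->tv_nsec + (1000000000 / HZ) - 1) / (1000000000 / HZ);

	to->timer.expires = entry_jiffies + how_long;	/* not "jiffies" read here */
	add_timer(&to->timer);
}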
> Now reading the SuS specifications I also like less and less our current
> kernel API of this sumbit_io, the SuS does exactly what I suggested
> originally that is aio_read/aio_write/aio_fsync as separate calls. So
> the merging effect mentioned by Ben cannot be taken advantage of by the
> kernel anyways because userspace will issue separate calls for each
> command.
Read it again. You've totally missed lio_listio. Also keep in mind what
happens with a 4G/4G split for x86, which is needed to address the
kernel virtual memory starvation issues.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Thu, Aug 15, 2002 at 09:42:25PM -0400, Benjamin LaHaise wrote:
> Read it again. You've totally missed lio_listio. Also keep in mind what
you're saying you prefer glibc to wrap aio_read/write/fsync and to
redirect them all to lio_listio after converting the iocb from the user
API to the kernel API, right? still I don't see why we should have a
different iocb. I would understand if you said we should simply
overwrite aio_lio_opcode inside aio_read(3) in glibc and pass it over
to the kernel with a single syscall, if it's low cost to just set the
lio_opcode, but having different data structures still doesn't sound
best. I mean, it would be nicer if things were more consistent.
> happens with 4G/4G split for x86 which are needed to address the kernel
> virtual memory starvation issues.
I don't see how the flushing flood is related to this; this is a normal
syscall, and any issue that applies to these aio_read/write/fsync should
apply to all other syscalls too. Also the 4G starvation will more likely
be fixed by x86-64, or in software by using a soft page size larger than
4k so that the mem_map array doesn't eat all of zone_normal. That'll
break backwards compatibility w.r.t. the page size offset, but at least
it won't cause such a significant regression in syscall performance
(again, this is a generic issue, not related to async-io as far as I
can tell).
Andrea
On Fri, Aug 16, 2002 at 03:57:17AM +0200, Andrea Arcangeli wrote:
> you're saying you prefer glibc to wrap the aio_read/write/fsync and to
> redirect all them to lio_listio after converting the iocb from user API to
> kernel API, right? still I don't see why should we have different iocb,
> I would understsand if you say we should simply overwrite aio_lio_opcode
> inside the aio_read(3) inside glibc and to pass it over to kernel with a
> single syscalls if it's low cost to just set the lio_opcode, but having
> different data structures doesn't sounds the best still. I mean, it
> would be nicer if things would be more consistent.
The iocb is as minimally different from the posix aio api as possible. The
main reason for the difference is that struct sigevent is unreasonably huge.
A lightweight posix aio implementation on top of the kernel API shares the
fields between the kernel iocb and the posix aiocb.
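E.g., one possible layout, purely illustrative (not the actual glibc
one):

/* a userspace aiocb that embeds the kernel iocb, so the POSIX wrappers
 * share fields with the kernel and sigevent stays out of the kernel ABI */
#include <signal.h>
#include <sys/types.h>
#include <linux/aio_abi.h>

struct posix_aiocb_sketch {
	struct iocb	kiocb;	/* aio_fildes/buf/nbytes/offset shared with the kernel */
	int		error;	/* cached aio_error() state */
	ssize_t		retval;	/* cached aio_return() value */
	struct sigevent	sigev;	/* the big piece the kernel iocb leaves out */
};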
> I don't see how the flushing flood is related to this, this is a normal
> syscall, any issue that applies to these aio_read/write/fsync should
> apply to all other syscalls too. Also the 4G starvation will be more
> likely fixed by x86-64 or in software by using a softpagesize larger
> than 4k so that the mem_map array doesn't load all the zone_normal.
A 4G/4G split flushes the TLB on every syscall.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Thu, 15 Aug 2002, Benjamin LaHaise wrote:
>
> A 4G/4G split flushes the TLB on every syscall.
This is just not going to happen. It will have to continue being a 3/1G
split, and we'll just either find a way to move stuff to highmem and
shrink the "struct page", or we'll just say "screw those 16GB+ machines on
x86".
Linus
On Thu, Aug 15, 2002 at 07:08:30PM -0700, Linus Torvalds wrote:
>
> On Thu, 15 Aug 2002, Benjamin LaHaise wrote:
> >
> > A 4G/4G split flushes the TLB on every syscall.
>
> This is just not going to happen. It will have to continue being a 3/1G
> split, and we'll just either find a way to move stuff to highmem and
> shrink the "struct page", or we'll just say "screw those 16GB+ machines on
> x86".
I wish life were that simple. Unfortunately, struct page isn't the only
problem with these abominations: the system can run out of kvm for
vm_area_struct, task_struct, files... Personally, I *never* want to see
those data structures being kmap()'d as it would hurt kernel code quality
whereas a 4G/4G split is well confined, albeit sickening.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."
On Thu, 15 Aug 2002, Linus Torvalds wrote:
> On Thu, 15 Aug 2002, Benjamin LaHaise wrote:
> >
> > A 4G/4G split flushes the TLB on every syscall.
>
> This is just not going to happen. It will have to continue being a 3/1G
> split, and we'll just either find a way to move stuff to highmem and
> shrink the "struct page", or we'll just say "screw those 16GB+ machines
> on x86".
I don't like a 4G/4G split at all, either.
But on the other hand, I don't hate it as much as all the
kludges that are being pushed into the kernel to support
these large machines right now ...
As long as it's just these huge machines that suffer, and
not the sane systems ;)
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Thu, Aug 15, 2002 at 10:00:54PM -0400, Benjamin LaHaise wrote:
> On Fri, Aug 16, 2002 at 03:57:17AM +0200, Andrea Arcangeli wrote:
> > you're saying you'd prefer glibc to wrap aio_read/write/fsync and
> > redirect them all to lio_listio after converting the iocb from the user API
> > to the kernel API, right? Still, I don't see why we should have different
> > iocbs. I would understand if you said we should simply overwrite
> > aio_lio_opcode inside aio_read(3) in glibc and pass it to the kernel with a
> > single syscall, if it's low cost to just set the lio_opcode, but having
> > different data structures still doesn't sound best. I mean, it would be
> > nicer if things were more consistent.
>
> The iocb is as minimally different from the posix aio api as possible. The
> main reason for the difference is that struct sigevent is unreasonably huge.
> A lightweight posix aio implementation on top of the kernel API shares the
> fields between the kernel iocb and the posix aiocb.
/* extra parameters */
__u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
__u64 aio_reserved3;
so you want the conversion to store only the pointer (if any) to the
sigevent in the iocb, rather than the whole sigevent, right? That is a
technically sound argument, and one I can happily buy as a reason for
having a different iocb. However, your argument also depends on whether
I/O completion notification via signal is the common case or not. I guess
in theory it should be the common case for software designed for best
performance.
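To make that concrete, here is a rough sketch (not from any posted patch)
of what a lightweight glibc-side aio_read(3) could look like on top of the
kernel API: the iocb field names follow the patch quoted in this thread,
the io_submit is issued as a raw syscall, the use of aio_reserved2 for the
sigevent pointer is only the TODO above, and sketch_aio_read/ctx are
made-up names.

#include <aio.h>              /* POSIX struct aiocb */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>      /* __NR_io_submit */
#include <linux/aio_abi.h>    /* kernel struct iocb, IOCB_CMD_PREAD, aio_context_t */

static aio_context_t ctx;     /* assumed set up elsewhere with io_setup(2) */

long sketch_aio_read(struct aiocb *acb)
{
	struct iocb cb;
	struct iocb *cbp = &cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;          /* glibc only sets the opcode... */
	cb.aio_fildes     = acb->aio_fildes;         /* ...and copies the shared fields */
	cb.aio_buf        = (unsigned long)acb->aio_buf;
	cb.aio_nbytes     = acb->aio_nbytes;
	cb.aio_offset     = acb->aio_offset;
	/* store only a pointer to the (huge) struct sigevent, per the TODO above */
	cb.aio_reserved2  = (unsigned long)&acb->aio_sigevent;

	return syscall(__NR_io_submit, ctx, 1, &cbp);
}

Whether that pointer ever needs to be chased is exactly the common-case
question above.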
>
> > I don't see how the flushing flood is related to this; this is a normal
> > syscall, and any issue that applies to these aio_read/write/fsync calls
> > should apply to all other syscalls too. Also, the 4G starvation will more
> > likely be fixed by x86-64, or in software by using a soft page size larger
> > than 4k so that the mem_map array doesn't consume all of ZONE_NORMAL.
>
> A 4G/4G split flushes the TLB on every syscall.
sure, that's why it's so slow. This applies to
reads/writes/exceptions/interrupts and everything else on the kernel side.
Andrea
On Thu, Aug 15, 2002 at 10:16:47PM -0400, Benjamin LaHaise wrote:
> On Thu, Aug 15, 2002 at 07:08:30PM -0700, Linus Torvalds wrote:
> >
> > On Thu, 15 Aug 2002, Benjamin LaHaise wrote:
> > >
> > > A 4G/4G split flushes the TLB on every syscall.
> >
> > This is just not going to happen. It will have to continue being a 3/1G
> > split, and we'll just either find a way to move stuff to highmem and
> > shrink the "struct page", or we'll just say "screw those 16GB+ machines on
> > x86".
>
> I wish life were that simple. Unfortunately, struct page isn't the only
> problem with these abominations: the system can run out of kvm for
> vm_area_struct, task_struct, files... Personally, I *never* want to see
> those data structures being kmap()'d as it would hurt kernel code quality
> whereas a 4G/4G split is well confined, albeit sickening.
after the mem_map is gone, there's still the option of CONFIG_2G or even
CONFIG_1G if kernel metadata is the problem. Of course it wouldn't be
a generic kernel, but I guess a 4G/4G split would probably be even less
generic. In short, we can do little at runtime to be generic. I guess
16G with a large soft page size shouldn't be too bad now that the
pagetables are in highmem; the most problematic case is >16G. Not that
the soft page size is easy at all to implement (4G/4G is certainly simpler
because it's self-contained in include/arch), but at least it can pay off
for the lower-memory setups too.
Andrea
On Thu, 15 Aug 2002, Benjamin LaHaise wrote:
> I wish life were that simple. Unfortunately, struct page isn't the only
> problem with these abominations: the system can run out of kvm for
> vm_area_struct, task_struct, files... Personally, I *never* want to see
> those data structures being kmap()'d as it would hurt kernel code quality
> whereas a 4G/4G split is well confined, albeit sickening.
A 4G/4G split will perform _so_ badly that it isn't even funny (it's also
technically impossible since you have to have some shared area anyway, but
you can get pretty close to it).
My bet is that we'll never do it due to performance issues. It's just
simpler to make the high pages end up being some special stuff (ie the old
"swap victim cache" etc that wouldn't show up to the VM proper).
Linus
On Thu, 15 Aug 2002, Linus Torvalds wrote:
>
> My bet is that we'll never do it due to performance issues. It's just
> simpler to make the high pages end up being some special stuff (ie the old
> "swap victim cache" etc that wouldn't show up to the VM proper).
Actually, the simplest scenario is to just make an arbitrary cut-off at
8G or 16G of RAM, make anything above it default to the hugetlb zone,
and make that use a separate hugetlb map which does refcounts at 2MB
granularity. And create fake "struct page" entries for those things that
have to have them, along with a separate kmap area that holds a few of the
big mappings.
There's an almost complete overlap between people who want hugetlb and
64GB x86 machines anyway, so I doubt you'd find people to complain.
And the advantage of the hugetlb stuff is exactly the fact that the normal
VM doesn't need to worry about it. It's nonswappable, and doesn't get IO
done into it through any of the normal paths.
Minimal impact.
Linus
On Thu, Aug 15, 2002 at 08:50:58PM -0700, Linus Torvalds wrote:
> Actually, the simplest scenario is to just make an arbitrary cut-off at
> 8G or 16G of RAM, make anything above it default to the hugetlb zone,
> and make that use a separate hugetlb map which does refcounts at 2MB
> granularity. And create fake "struct page" entries for those things that
> have to have them, along with a separate kmap area that holds a few of the
> big mappings.
> There's an almost complete overlap between people who want hugetlb and
> 64GB x86 machines anyway, so I doubt you'd find people to complain.
> And the advantage of the hugetlb stuff is exactly the fact that the normal
> VM doesn't need to worry about it. It's nonswappable, and doesn't get IO
> done into it through any of the normal paths.
> Minimal impact.
Ew! 64GB doesn't need or want any of that. It just needs bugfixes (more
badly than most machines) and page clustering to keep mem_map down to a
decent size, IIRC. The "highmem ratio" is irrelevant; it's just a matter
of implementing page clustering and bugfixing. The core kernel doesn't
need to know the page size is fake, and it needs the rest of the bugfixes
for e.g. buffer_heads proliferating out of control anyway. No magic.
No uglies.
32GB has already been booted & verified to run Linux. The only badness
is sizeof(mem_map) and the usual contingent of "buffer_heads are out of
control and shrink_cache() can't figure out when to shoot down a slab"
issues. And, well, performance issues show up now and then but those
improvements propagate back to the smaller machines, albeit in smaller
proportions.
This notion of cutting highmem boxen off at 16GB really does not sound
hot at all. At the very least webserving goes on at 32+GB and that has
no use whatsoever for the hugetlb stuff. Still other workloads, e.g.
client front ends for databases, are in similar positions. People are
actually trying to get things done that need more than 32GB pagecache
on i386 machines.
The big reason the database people are interested in hugetlb stuff is
to work around the pagetable proliferation bugs. They got a backdoor
to take over the VM, so they're even happier than they would be with
a bugfix. But they're not the only workloads around, and the bugfix is
still needed for all 64-bit and more general 32-bit workloads (it was
not fixed by pte-highmem).
Cheers,
Bill
On Thu, Aug 15, 2002 at 09:42:25PM -0400, Benjamin LaHaise wrote:
> > Now reading the SuS specifications I also like less and less our current
> > kernel API of this sumbit_io, the SuS does exactly what I suggested
> > originally that is aio_read/aio_write/aio_fsync as separate calls. So
> > the merging effect mentioned by Ben cannot be taken advantage of by the
> > kernel anyways because userspace will issue separate calls for each
> > command.
>
> Read it again. You've totally missed lio_listio. Also keep in mind what
>
Also, wasn't the fact that the API was designed to support both POSIX
and completion port style semantics, another reason for a different
(lightweight) in-kernel api? The c10k users of aio are likely to find
the latter model (i.e. completion ports) more efficient.
Regards
Suparna
On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote:
> Also, wasn't the fact that the API was designed to support both POSIX
> and completion port style semantics, another reason for a different
> (lightweight) in-kernel api? The c10k users of aio are likely to find
> the latter model (i.e. completion ports) more efficient.
if it's handy for you, can you post a link to the API defined by
POSIX and completion ports so I can read them too and not only SuS?
btw, I don't see why there are so many APIs doing the same thing. I think
for the good of Linux it would be nice to standardize on and recommend
one of these user APIs, so new software will use the API we recommend,
rather than choosing almost randomly every time. The rest will then be
backwards compatibility stuff for apps ported from other OSes, and it will
be worthwhile to have the kernel API match what we recommend as the user
API.
Andrea
On Fri, Aug 16, 2002 at 12:03:34PM +0200, Andrea Arcangeli wrote:
> On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote:
> > Also, wasn't the fact that the API was designed to support both POSIX
> > and completion port style semantics, another reason for a different
> > (lightweight) in-kernel api? The c10k users of aio are likely to find
> > the latter model (i.e. completion ports) more efficient.
>
> if it's handy for you, can you post a link to the API defined by
> POSIX and completion ports so I can read them too and not only SuS?
Don't have anything handy atm that's any better than what you could
get through doing a google on "IO Completion ports". (See section at
the end of this note for some info)
Completion port APIs aren't really part of any standard, but are provided
by some operating systems (NT, AS/400), most of which use a similar
interface. I personally found it useful to refer to the DAFS
completion groups API (DAFS API Spec at http://www.dafscollaborative.org)
just to get an idea of something that takes these various existing
interfaces into account to arrive at an interface for async i/o
completion (even though that really is all a direct user-space API
implementation for remote file data access and has nothing to do with
in-kernel i/o interfaces).
>
> btw, I don't see why there are so many APIs doing the same thing. I think
> for the good of Linux it would be nice to standardize on and recommend
> one of these user APIs, so new software will use the API we recommend,
> rather than choosing almost randomly every time. The rest will then be
> backwards compatibility stuff for apps ported from other OSes, and it will
> be worthwhile to have the kernel API match what we recommend as the user
> API.
Since you are analysing this stuff I wonder if you have by any
chance looked through the aio design notes I had posted a while back.
I did try to discuss the background in terms of completion apis used
elsewhere even though I didn't record the specific details of those
interfaces. Am appending that section of the doc below.
Regards
Suparna
-------------------------------------------
2.5 Completion/Readiness notification:
Comment: Readiness notification can be treated as the completion of an
asynchronous operation to await readiness.
POSIX aio provides for waiting for completion of a particular request, or
for an array of requests, either by means of polling, or asynchronously
through signals. On some operating systems, there is a notion
of an I/O Completion port (IOCP), which provides a flexible and scalable way
of grouping completion events. One can associate multiple file descriptors
with such a completion port, so that all completion events for requests on
those files are sent to the completion port. The application can thus issue
a wait on the completion port in order to get notified of any completion
event for that group. The level of concurrency can be increased simply by
increasing the number of threads waiting on the completion port. There are
also certain additional concurrency control features that can be associated
with IOCPs (as on NT), where the system decides how many threads to
wakeup when completion events occur, depending on the concurrency limits
set for the queue, and the actual number of runnable threads at that moment.
Keeping the number of runnable threads constant in this manner protects
against blocking due to page faults and other operations that cannot be
performed asynchronously.
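For contrast, a minimal sketch of the POSIX-style wait on one specific
request, using only the standard aio_read/aio_suspend/aio_error/aio_return
calls (error handling trimmed; read_and_wait is just an illustrative name):

#include <aio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>

/* issue one read and block until that particular request completes */
ssize_t read_and_wait(int fd, void *buf, size_t len, off_t off)
{
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf    = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = off;

	if (aio_read(&cb) < 0)
		return -1;
	while (aio_error(&cb) == EINPROGRESS)
		aio_suspend(list, 1, NULL);   /* may be interrupted; just retry */
	return aio_return(&cb);
}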
On a similar note, the DAFS api spec incorporates completion groups for
handling async i/o completion, the design being motivated by VI completion
queues, NT IOCPs and the Solaris aiowait interfaces. Association of an
i/o with a completion group (NULL would imply the default completion queue)
happens at the time of i/o submission, which lets the provider know where
to place the event when it completes, contrary to the aio_suspend style of
interface, which specifies the grouping only when waiting on completion.
This implementation for Linux makes use of a similar notion to provide
support for completion queues. There are APIs to set up and destroy such
completion queues, specifying the maximum queue lengths that a queue is
configured for. Every asynchronous i/o request is associated with a completion
queue when it is submitted (like the DAFS interfaces), and an application
can issue a wait on a given queue to be notified of a completion event for
any request associated with that queue.
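As a rough illustration of that model (assuming the usual libaio helpers
io_prep_pread/io_submit; error handling omitted, submit_pair is a made-up
name): two operations on two different fds are tied to the same completion
queue at submission time.

#include <stddef.h>
#include <libaio.h>

int submit_pair(io_context_t ctx, int fd1, int fd2,
		void *buf1, void *buf2, size_t len)
{
	struct iocb a, b;
	struct iocb *list[2] = { &a, &b };

	io_prep_pread(&a, fd1, buf1, len, 0);
	io_prep_pread(&b, fd2, buf2, len, 0);

	/* the completion group is chosen here, at submission time,
	   not when somebody later waits on it */
	return io_submit(ctx, 2, list);
}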
BSD kqueue (Jonathan Lemon) provides a very generic method for registering
for and handling notification of events or conditions based on the concept
of filters of different types. This covers a wide range of conditions
including file/socket readiness notification (as in poll), directory/file
(vnode) change notifications, process create/exit/stop notifications, signal
notification, timer notification and also aio completion notification
(via SIGEV_EVENT). The kqueue is equivalent to a completion queue, and
the interface allows one to both register for events and wait for (and
pick up) any events on the queue within the same call. It is rather flexible
in terms of providing for various kinds of event registration/notification
requirements, e.g. one-shot or every time, temporary disabling, clearing
state if transitions need to be notified, and it supports both edge- and
level-triggered types of filters.
2.5.1 Some Requirements which are addressed:
1. Efficient for large numbers of events and connections
- The interface to register events to wait for should be separate from
the interface used to actually poll/wait for the registered events to
complete (unlike traditional poll/select), so that registrations can
hold across multiple poll waits with minimum user-kernel transfers.
(It is better to handle this at interface definition level than
through some kind of an internal poll cache)
The i/o submission routine takes a completion queue as a parameter,
which associates/registers the events with a given completion group/queue.
The application can issue multiple waits on the completion queue using a
separate interface.
- Ability to reap many events together (unlike current sigtimedwait
and sigwaitinfo interfaces)
The interface used to wait for and retrieve events, can return an
array of completed events rather than just a single event.
- Scalable/tunable queue limits - at least have a limit per queue rather
than system wide limits
Queue limits can be specified when creating a completion group.
TBD: A control interface for changing queue parameters/limits (e.g
io_queue_grow) might be useful
- Room for more flexible/tunable wakeup semantics for better concurrency
control
Since the core event queue can be separated from the notification mechanism,
the design allows one to provide for alternative wakeup semantics
to optimize concurrency and reduce redundant or under-utilized context
switches. Implementing these might require some additional parameters or
interfaces to be defined. BTW, it is desirable to provide a unified interface
for notification and event retrieval to a caller, to avoid synchronization
complexities, even if the core policies are separable underneath in-kernel.
[See the discussion in Sec 2.6 on wakeup policies for a more
detailed discussion on this]
2. Enable flexible grouping of operations
- Flexible grouping at the time of i/o submission
(different operations on the same fd can belong to different groups,
operations on different fds can belong to the same group)
- Ability to wait for at least a specified number of operations from
a specified group to complete (at least N vs at least 1 helps with
batching on the way up, so that the application can perform its post
processing activities in a batch, without redundant context switches)
The DAFS api supports such a notion, both in its cg_batch_wait interface
which returns when either N events have completed, or with less than N
events in case of a timeout, and also in the form of a num_completions
hint at the time of i/o submission. The latter is a hint that gets sent
out to the server as a characteristic of the completion queue or session,
so the server can use this hint to batch its responses accordingly.
Knowing that the caller is interested only in batch completions helps
with appropriate optimizations.
Note: The Linux aio implementation today only supports "at least one"
and not "at least N" (e.g the aio_nwait interface on AIX).
The tradeoffs between responsiveness and fairness issues tend
to get amplified when considering "at least N" type semantics,
and this is one of the main concerns in supporting it.
[See discussion on wakeup policies later]
- Support dynamic additions to the group rather than a static or one time
list passed through a single call
Multiple i/o submissions can specify the same completion group, enabling
events to be added to the group.
[Question: Is the option of the completion group being different from the
submission batch/group (i.e. per iocb grouping field) useful to have ?
Like POSIX using sigevent as part of iocb]
3. Should also be able to wait for a specific operation to complete (without
being very inefficient about it)
One could either have low overhead group setup/teardown so such an operation
may be assigned a group of its own (costs can be amortized across multiple
such operations by reusing the same group if possible) or provide an
interface to wait for a specific operation to complete.
The latter would be more useful, though it requires a per-request wait queue
or something similar. The current implementation has a syscall interface
defined for this (io_wait), which hasn't been coded up as yet. The plan is
to use hashed wait queues to conserve on space.
There are also some semantic issues around the possibility of another
waiter on the queue picking up the corresponding completion event for this
operation. To address this, the io_wait interface might be modified to
include an argument for the returned event.
BTW, there is an option of dealing with this using the group primitives
either in user space, or even in kernel by waiting in a loop for any event
in the group until the desired event occurs, but this could involve some
extra interim wakeups / context switches under the covers, and a user
level event distribution mechanism for the other events picked up in the
meantime.
4. Enable Flexible distribution of responsibility across multiple
threads/components
Different threads can handle submission for different operations,
and another pool of threads could wait on completion.
The degree of concurrency can be improved simply by increasing threads
in the pool that wait for and process completion of operations for
that group.
5. Support for Prioritized Event Delivery
This involves the basic infrastructure to be able to accord higher
priority to the delivery of certain completion events over others,
(e.g. depending on the request priority settings of the corresponding
request), i.e. if multiple completion events have arrived on the
queue, then the events for higher priorities should be picked up
first by the application.
On Fri, Aug 16, 2002 at 04:53:06PM +0530, Suparna Bhattacharya wrote:
> On Fri, Aug 16, 2002 at 12:03:34PM +0200, Andrea Arcangeli wrote:
> > On Fri, Aug 16, 2002 at 03:09:46PM +0530, Suparna Bhattacharya wrote:
> > > Also, wasn't the fact that the API was designed to support both POSIX
> > > and completion port style semantics, another reason for a different
> > > (lightweight) in-kernel api? The c10k users of aio are likely to find
> > > the latter model (i.e. completion ports) more efficient.
> >
> > if it's handy for you, can you post a link to the API defined by
> > POSIX and completion ports so I can read them too and not only SuS?
>
> Don't have anything handy atm that's any better than what you could
> get through doing a google on "IO Completion ports". (See section at
> the end of this note for some info)
Oh sorry, I should have mentioned Dan Kegel's site which actually
has all the pointers you need. See http://www.kegel.com/c10k.html
(It has pointers to links to both NT and OS/400 completion ports)
Regards
Suparna
>> > A 4G/4G split flushes the TLB on every syscall.
>>
>> This is just not going to happen. It will have to continue being a 3/1G
>> split, and we'll just either find a way to move stuff to highmem and
>> shrink the "struct page", or we'll just say "screw those 16GB+ machines on
>> x86".
>
> I wish life were that simple. Unfortunately, struct page isn't the only
> problem with these abominations: the system can run out of kvm for
> vm_area_struct, task_struct, files... Personally, I *never* want to see
> those data structures being kmap()'d as it would hurt kernel code quality
> whereas a 4G/4G split is well confined, albeit sickening.
At least some of those you don't have to kmap ... at least not in
the traditional sense. This sort of thing is a good application
for the per-process (or per-task) kernel virtual address area.
you just map in the stuff you need for your own task, instead
of having to share the global space with everybody. Some things
have to be global (well, easier at least) like the task_struct,
but the kernel stacks could be moved out with a little work,
files, vm_area_structs, etc.
That sounds more appealing to me than either kmap or a 4G/4G split.
M.
On Fri, 16 Aug 2002, Martin J. Bligh wrote:
>
> At least some of those you don't have to kmap ... at least not in
> the traditional sense. This sort of thing is a good application
> for the per-process (or per-task) kernel virtual address area.
> you just map in the stuff you need for your own task, instead
> of having to share the global space with everybody.
Careful.
The VM space is shared _separately_ from other data structures, which
means that you can _not_ use per-VM virtual address areas and expect them
to scale with load. And then some VM happens to have thousands of threads,
and you're dead.
> Some things
> have to be global (well, easier at least) like the task_struct,
> but the kernel stacks could be moved out with a little work,
> files, vm_area_structs, etc.
Kernel stacks most certainly can't do this easily, since you'll just hit
the scalability problem somewhere else (ie many threads, same VM).
And files, for example, can not only be many files for one VM, you can
have the reverse too, ie many VM's, one file table.
Linus
>> At least some of those you don't have to kmap ... at least not in
>> the traditional sense. This sort of thing is a good application
>> for the per-process (or per-task) kernel virtual address area.
>> you just map in the stuff you need for your own task, instead
>> of having to share the global space with everybody.
>
> Careful.
>
> The VM space is shared _separately_ from other data structures, which
> means that you can _not_ use per-VM virtual address areas and expect them
> to scale with load. And then some VM happens to have thousands of threads,
> and you're dead.
OK ... not sure I understand the exact scenario you're envisioning,
but I can certainly see some problems in that area. There are two
different ways we could do this (or a combination of both), and I'm
not 100% sure if they solve the problems you mention, but it'd be
interesting to see what you think.
1. We have a per-process UKVA (user-kernel virtual address space),
which is great for per-process stuff like mapping pagetables. Dave
McCracken made an implementation of this that carves off a fixed
amount of space between the top of the stack and PAGE_OFFSET.
That makes highpte more efficient by saving the kmaps most of the
time (or it should).
2. A per task UKVA, that'd probably have to come out of something
like the vmalloc space. I think Bill Irwin derived something like
that from Dave's work, though I'm not sure it's complete & working
as yet. Per task things like the kernel stack (minus the task_struct
& waitqueues) could go in here.
> Kernel stacks most certainly can't do this easily, since you'll just hit
> the scalability problem somewhere else (ie many threads, same VM).
Does (2) solve some of the thread scalability problems you're worried
about?
> And files, for example, can not only be many files for one VM, you can
> have the reverse too, ie many VM's, one file table.
Could we fix this by having multiple tasks map the same page and share
it? Still much less vaddr space overhead than global?
Hopefully I haven't totally missed your point ... if so, hit me again,
but harder and slower ;-)
M.
On Fri, 16 Aug 2002, Martin J. Bligh wrote:
>
> 1. We have a per-process UKVA (user-kernel virtual address space),
What is your definition of a "process"?
Linux doesn't really have any such thing. Linux threads share different
amounts of stuff, and a traditional process just happens to share nothing.
However, since they _can_ share more, it's damn hard to see what a
"per-process" mapping means.
> 2. A per task UKVA, that'd probably have to come out of something
> like the vmalloc space. I think Bill Irwin derived something like
> that from Dave's work, though I'm not sure it's complete & working
> as yet. Per task things like the kernel stack (minus the task_struct
> & waitqueues) could go in here.
And what is your definition of a "task"?
You seem to think that a task is one thread ("per task things like the
kernel stack"), ie a 1:1 mapping with a "struct task_stuct".
But if you have such a mapping, then you _cannot_ make a per-task VM
space, because many tasks will share the same VM. You cannot even do a
per-cpu mapping change (and rewrite the VM on thread switch), since the VM
is _shared_ across CPU's, and absolutely has to be in order to work with
CPU's that do TLB fill in hardware (eg x86).
The fact is, that in order to get the right TLB behaviour, the _only_
thing you can do is to have a "per-MM UKVA". It's not per thread, and it's
not per process. It's one per MM, which is _neither_.
And this is where the problems come in. Since it is per-MM (and thus
shared across CPU's) updates need to be SMP-safe. And since it is per-MM,
it means that _any_ data structure that might be shared across different
MM's are really really dangerous to put in this thing (think virtual
caches on some hardware).
And since it is per-MM, it means that anything that there can be multiple
of per MM (which is pretty much _every_ data structure in the kernel)
cannot go at a fixed address or anything like that, but needs to be
allocated within the per-MM area dynamically.
I suspect that you are used to the traditional UNIX "process" notion,
where a "process" has exactly one file table, and has exactly one set of
signals, one set of semaphores etc. In that setup it can be quite
convenient to map these into the VM address space at magical addresses.
You may also be used to per-CPU page tables or software TLB fill
situations, where different CPU's can have different TLB contents. That
can be used to have per-thread mappings. Again, that doesn't work on Linux
due to page table sharing and hw TLB fills.
Linus
On Fri, Aug 16, 2002 at 09:00:52PM -0700, Linus Torvalds wrote:
> Careful.
> The VM space is shared _separately_ from other data structures, which
> means that you can _not_ use per-VM virtual address areas and expect them
> to scale with load. And then some VM happens to have thousands of threads,
> and you're dead.
This is a clear and present danger, and the strategies to deal with them,
although they're not ready for presentation, are in development.
At some point in the past, Martin Bligh wrote:
>> Some things have to be global (well, easier at least) like the
>> task_struct, but the kernel stacks could be moved out with a little
>> work, files, vm_area_structs, etc.
On Fri, Aug 16, 2002 at 09:00:52PM -0700, Linus Torvalds wrote:
> Kernel stacks most certainly can't do this easily, since you'll just hit
> the scalability problem somewhere else (ie many threads, same VM).
> And files, for example, can not only be many files for one VM, you can
> have the reverse too, ie many VM's, one file table.
Stacks are probably not the foremost priority here. First and foremost
come the bugs that stop smaller workloads cold. But as you've said,
scalability problems can and will arise in stacks and still other
things mapped in similar ways by the traditional UNIX architecture.
The demands 32-bit machines can make of a generic kernel are limited,
and this is understood. The intention of our developers is not to
corrupt the generic kernel with hacks that will not extend beyond 32
bits, but to exploit the hardware to which we have access in order to expose
the largest number of scalability issues possible and address them. And
to the best of our abilities we will do the work necessary to address
them ourselves as opposed to presenting burdens on others. And ideally
we will also benefit common users with the same patches.
Please understand we are absolutely not interested in working against
your intentions but only working with them and taking on labor
ourselves to advance the state of the Linux kernel for all workloads,
both large and small. And that means swapping (which TPC/H does not do)
and IDE. Yes, I'm making promises that I myself cannot keep. But I am
confident that the support is present to such a degree that I am
absolutely certain the contributions and contributors to make them
happen will be funded, with or without my participation.
Cheers,
Bill
On Fri, 16 Aug 2002, Linus Torvalds wrote:
>
> I suspect that you are used to the traditional UNIX "process" notion,
> where a "process" has exactly one file table, and has exactly one set of
> signals, one set of semaphores etc. In that setup it can be quite
> convenient to map these into the VM address space at magical addresses.
Btw, at this point I should say that that doesn't mean that I'm _against_
a per-VM kmap table, I'm just pointing out that it's not a trivial issue.
Andrea and I talked about exactly this at OLS, because Andrea would have
liked to use it for handling the generic_file_write() kmap case without
having to worry about running out of kmap's and the deadlocks we used to
have in that space (before the atomic user copy approach).
And the thing is, you _can_ use a per-VM kmap setup, but it really only
moves the problem from a global kmap space ("everybody shares the same
VM") into a slightly smaller subset of it, a global thread kmap ("all
threads share the same VM").
So at least in that particular case, by moving it from a global space to a
per-VM space, the DoS wrt generic_file_write() didn't actually go away. It
just had to be triggered slightly differently (ie using lots of threads).
There may be other cases where this is ok. Moving to a per-VM kmap space
may not _fix_ some fundamental scalability problem, but it might move it
further out and make it a non-issue under normal load. Which is why I
don't think the idea is fundamentally flawed, I just wanted to point out
some of the traps to people since we've already almost fallen into some of
them..
Linus
>> 1. We have a per-process UKVA (user-kernel virtual address space),
>
> What is your definition of a "process"?
Sorry ... per shared address space ... so multiple tasks sharing an
mm are 1 process (to me at least).
> And what is your definition of a "task"?
>
> You seem to think that a task is one thread ("per task things like the
> kernel stack"), ie a 1:1 mapping with a "struct task_stuct".
yup, that's what I meant. task == 1 task_struct.
> But if you have such a mapping, then you _cannot_ make a per-task VM
> space, because many tasks will share the same VM. You cannot even do a
> per-cpu mapping change (and rewrite the VM on thread switch), since the VM
> is _shared_ across CPU's, and absolutely has to be in order to work with
> CPU's that do TLB fill in hardware (eg x86).
I don't see why the area above PAGE_OFFSET has to be global, or per
VM (by which I'm assuming you're meaning the set of pagetables per
process, aka group of tasks sharing an mm).
Assume 3 level page tables and a 3/1 user/kernel split for the sake
of argument. The bottom 3 PGD entries point to user PMD pages,
and the top 1 to the kernel PMD page. At the moment, the PGDs are per
VM, but say we make them per task instead ... each task also gets
a copy of the standard kernel PMD (which never changes in the normal
course of things). In that PMD, we tweak the top couple of entries
to point to a per-task set of entries ... but the rest of the PMD
entries all point back to a shared set of PTE entries (well, except
ZONE_NORMAL is all large pages, so there ain't none for those).
Yes, I guess you'd have to TLB flush on the context switch with
shared mm's which you don't have to do now, and you'd use an extra
couple of pages per task, but that's of phys ram, not vaddr space
which is what's precious. I think that all works, but it's kind of
late ;-)
It also has the advantage that you can wedge kernel text replication
for NUMA in the per-task PMD entries.
> The fact is, that in order to get the right TLB behaviour, the _only_
> thing you can do is to have a "per-MM UKVA". It's not per thread, and it's
> not per process. It's one per MM, which is _neither_.
OK, I was defining a process as the set of tasks sharing an MM.
You seem to have a different definition - could you clarify for me?
> And this is where the problems come in. Since it is per-MM (and thus
> shared across CPU's) updates need to be SMP-safe. And since it is per-MM,
> it means that _any_ data structure that might be shared across different
> MM's are really really dangerous to put in this thing (think virtual
> caches on some hardware).
OK ... I wasn't thinking virtual caches, I'll admit. But how many
crazy 32 bit architectures do we have wanting 64Gb of RAM? ;-)
I *think* this is OK for ia32? 64 bit machines don't care about
all this nonsense.
> And since it is per-MM, it means that anything that there can be multiple
> of per MM (which is pretty much _every_ data structure in the kernel)
> cannot go at a fixed address or anything like that, but needs to be
> allocated within the per-MM area dynamically.
right. I didn't mean to imply that it's trivial, or a panacea,
but it's an interesting concept.
M.
> Btw, at this point I should say that that doesn't mean that I'm _against_
> a per-VM kmap table, I'm just pointing out that it's not a trivial issue.
>
> Andrea and I talked about exactly this at OLS, because Andrea would have
> liked to use it for handling the generic_file_write() kmap case without
> having to worry about running out of kmap's and the deadlocks we used to
> have in that space (before the atomic user copy approach).
I know ;-) We talked over that one a lot ... but I think we ended
up just avoiding the problem (every solution we came up with sucked).
I wasn't so worried about avoiding kmap pool exhaustion as reducing
TLB flushing - if we put 4096 entries in the pool, and can cope with
blocking if the silly thing does happen to fill up, that seems OK.
The *really* nice thing is that you can do all sorts of smart stuff
with the TLB at this point. As we now duplicated the pool per VM,
it just got much, much larger. Chances are by the time you come to
re-use an entry you've already TLB flushed all the CPUs from context
switching. Keep track of that and you *never* tlb flush for kmap.
But in the end the two main users of kmap seem to be copy_to/from_user
stuff and highpte - using atomic for the former and the per-VM space
for the latter seems to fix the problem far enough that nobody could
see it anymore, so nobody cares ;-)
> And the thing is, you _can_ use a per-VM kmap setup, but it really only
> moves the problem from a global kmap space ("everybody shares the same
> VM") into a slightly smaller subset of it, a global thread kmap ("all
> threads share the same VM").
Well, I disagree with the "slightly" in that statement, but otherwise
yes ;-) If you're not running heavy threading, it works brilliantly.
If you are, it's better, but not necessarily brilliant.
> There may be other cases where this is ok. Moving to a per-VM kmap space
> may not _fix_ some fundamental scalability problem, but it might move it
> further out and make it a non-issue under normal load. Which is why I
> don't think the idea is fundamentally flawed, I just wanted to point out
> some of the traps to people since we've already almost fallen into some of
> them..
Yup, I think we ended up pushing it out by other means, but it's
still interesting. Things like this only need to be pushed out
until the timely death of 32 bit machines ;-)
If I'm not insane about the per-task stuff in the previous email,
that could be used for a per-task kmap, which would be much nicer
all round. But the chances of me being insane are pretty good ;-)
M.
On Fri, 16 Aug 2002, Martin J. Bligh wrote:
>
> I don't see why the area above PAGE_OFFSET has to be global, or per
> VM (by which I'm assuming you're meaning the set of pagetables per
> process, aka group of tasks sharing an mm).
Basic issue: if the VM's aren't _identical_ (in every way, including the
kernel one), they cannot share the page tables in an SMP environment with
two threads running on two CPU's at the same time.
And once you cannot share the page tables, you're screwed.
> Assume 3 level page tables and a 3/1 user/kernel split for the sake
> of argument.
No, no, that's the wrong way to go about it. You have to show a _portable_
way to do it, not a "if I assume this, I can do it".
For example, on x86 with the regular 2-level page tables, if you want to
have different kernel mappings, you have to copy the page directory
per-CPU, and then on task switch you have to change the PGD appropriately.
Which, btw, means that you have to invalidate the TLB for that CPU, even
if you would otherwise not have needed to. Look at how the lazy TLB
switching works, and realize that two threads can _switch_ CPU's as things
stand now, without ever a single TLB invalidate happening. They can take
over the TLB of the other thread when they move to another CPU. You'd
break that, horribly and fundamentally.
So to make this work, you'd have to have:
- architecture-specific hacks
- realize that not all architectures can do it at all, so the places that
depend on this would have to have some abstraction that makes it go
away when not needed.
- fix up lazy TLB switching (conditionally on the hack).
It just sounds really messy to me.
Linus
>> Assume 3 level page tables and a 3/1 user/kernel split for the sake
>> of argument.
>
> No, no, that's the wrong way to go about it. You have to show a
> _portable_ way to do it, not a "if I assume this, I can do it".
All I was doing was trying to illustrate that it's actually possible.
I wasn't actually proposing doing it like this, there's probably
better ways. Much less was I expecting you to like it - hell, I don't
like it either ;-) So when I go on below to try to prove that it
does work ... don't think I'm trying to sell it ;-)
Maybe this was Washer's idea, maybe it wasn't ... we talked this
over a lot for NUMA kernel text replication and I can't remember
who came up with it, to be perfectly honest. He might be happier
denying it anyway ;-)
> Basic issue: if the VM's aren't _identical_ (in every way, including the
> kernel one), they cannot share the page tables in an SMP environment with
> two threads running on two CPU's at the same time.
True ... to an extent. They can't share PGDs.
> And once you cannot share the page tables, you're screwed.
OK, firstly, I won't deny it's an arch specific hack. But ...
I don't think you're screwed, as long as you never have to update
the part that's split. Under the ia32 PAE mode, that's pretty much
going to be the case, as long as all PMDs are incarnated all the
time, which they're going to be anyway - there's text segment in the
first Gb, libraries in the second, stack in the third, and kernel
in the fourth. If for some reason that wasn't true, you'd just
create the blank PMD anyway.
So we have split the PGD per task, but the PMDs and PTEs for user
space are all still shared, which is all that matters, because
that's all we ever have to update. The PMD for kernel space is not
shared, but we never update that either.
> For example, on x86 with the regular 2-level page tables, if you
> want to have different kernel mappings, you have to copy the
> page directory per-CPU, and then on task switch you have to change
> the PGD appropriately.
Right, it doesn't work for 2 level pagetables very well. But I
can't see people without PAE actually wanting it. In fact, I don't
know of anyone but large ia32 PAE machines who'd want it ... would
be interested to hear about anyone else who's got this sort of
virtual address space pressure.
> Which, btw, means that you have to invalidate the TLB for that CPU, even
> if you would otherwise not have needed to. Look at how the lazy TLB
> switching works, and realize that two threads can _switch_ CPU's as things
> stand now, without ever a single TLB invalidate happening. They can take
> over the TLB of the other thread when they move to another CPU. You'd
> break that, horribly and fundamentally.
Yup, I know that, and I won't deny it's horrible ;-) Would be much
nicer if we had a flush_tlb_range that worked on that chip, but still.
Not good for heavy threading. But bear in mind the alternative we
were talking about (well, Ben was talking about, and you didn't want
to talk about ;-)) is TLB flushing every system call, not every
context switch between threads. Personally, I think that's worse.
> It just sounds really messy to me.
It is ;-) Implementing it vaguely cleanly would be hard. But I still
think it's an intriguing concept ... the other problem I've been
looking at is kernel text replication for ia32, and that's hard too.
This actually solves both problems, which is probably the only feather
in its cap. If anyone has any other ways to solve the replication
problem I'd be most interested ... (people muttered things about using
segmentation once in a dark and dingy corner, but refuse to admit who
they were).
M.
On Fri, 16 Aug 2002, Linus Torvalds wrote:
> But if you have such a mapping, then you _cannot_ make a per-task VM
> space, because many tasks will share the same VM. You cannot even do a
> per-cpu mapping change (and rewrite the VM on thread switch), since the
> VM is _shared_ across CPU's, and absolutely has to be in order to work
> with CPU's that do TLB fill in hardware (eg x86).
i'm just trying to insert the notion here that it *is* possible to do
'software TLB fill' on x86 as well - it's just too much pain and very
likely not worth it. The pgd entry of the top 4MB mapping can be filled in
temporarily, the space accessed (causing a hw TLB fill), and then the pgd
entry can be zeroed out again - keeping the 'soft filled TLB' still
intact. This assumes that the intermediate pgd value cannot be observed by
any other CPU - which can be achieved via either cross-CPU calls (lots of
overhead to the TLB miss 'handler'), or the hope that freshly accessed &
rewritten, locked cachelines are not seen by other CPUs, yet. (some CPUs
do define a certain window of non-observation for locked MESI lines, in
which the soft TLB handling stuff can be done, theoretically.) This
necessitates disabling interrupts, and worse, NMIs, so it's really
flaky.
once the TLB gets flushed it causes a fault again - at which point the
whole 'sw TLB fill' ordeal has to begin again.
so this is not practical at all, but perhaps interesting. If eg.
kernel-space used 4MB pages only for this purpose then we would not get
many 'TLB misses', because on most (all?) x86 CPUs the large-page TLBs are
isolated from the 4K page TLBs. They could even survive TLB flushes via
the PGE bit set.
but this is so hw-specific that the use of x86 segmentation looks like a
highlevel language in comparison :-)
Ingo
On Fri, Aug 16, 2002 at 10:12:04PM -0700, Martin J. Bligh wrote:
> Yes, I guess you'd have to TLB flush on the context switch with
> shared mm's which you don't have to do now, and you'd use an extra
Flushing the tlb across thread context switches sounds like the worst
part. One of the main reasons to use threads is to avoid tlb flushing
across context switches and to reduce the context switch nearly to a
longjump through kernel space (i.e. use threads instead of segmentation
to skip the tlb flushes across context switches). If we had ASNs on x86
that wouldn't be such a big problem; a tlb flush in the common case would
just bump the current ASN. However, the main shortcoming of the x86
architecture is the lack of tlb tagging (hopefully it'll be fixed soon,
it's definitely fixable without breaking backwards compatibility, so
eventually some x86 hardware vendor will wake up and the others will have
to follow). So I would guess adding some per-VM locking like an
mm->kmap_sem to serialize the use of the per-VM pool of kmaps sounds
better, unless we get address space numbers on 32-bit x86; over days of
computation there are going to be many more context switches than page
faults, though it also depends on the workload. The point about needing
the tlb flush anyway for replicated .text is valid, however not all SMP
highmem boxes are necessarily NUMA boxes too, and if they aren't NUMA I
guess they'd prefer not to flush the tlb over context switches. You may
want to measure the overhead of tlb flushes by adding a
__flush_tlb() in switch_to across two tasks (not across kernel threads
though) to simulate the behaviour of your proposed per-task PMD design.
Andrea
On Sat, Aug 17, 2002 at 10:02:23AM -0700, Linus Torvalds wrote:
> So to make this work, you'd have to have:
> - architecture-specific hacks
> - realize that not all architectures can do it at all, so the places that
> depend on this would have to have some abstraction that makes it go
> away when not needed.
> - fix up lazy TLB switching (conditionally on the hack).
>
> It just sounds really messy to me.
Indeed. Assuming this is a hack under a CONFIG_X86_NUMA_HACK hardwired
for certain config options and a certain architecture, the tlb flushing
across threads sounds like the worst part, in particular because it's x86.
Andrea
On Sat, Aug 17, 2002 at 02:27:34PM -0700, Martin J. Bligh wrote:
> Not good for heavy threading. But bear in mind the alternative we
> were talking about (well, Ben was talking about, and you didn't want
> to talk about ;-)) is TLB flushing every system call, not every
> context switch between threads. Personally, I think that's worse.
yes that's worse but that was meant to enlarge the ZONE_NORMAL, not to
reduce the kmap overhead. Even with the per-task VM virtual zone that
changes at every switch_to, you'd still have the ZONE_NORMAL shortage
problem.
> looking at is kernel text replication for ia32, and that's hard too.
> This actually solves both problems, which is probably the only feather
> in its cap. If anyone has any other ways to solve the replication
> problem I'd be most interested ... (people muttered things about using
> segmentation once in a dark and dingy corner, but refuse to admit who
> they were).
actually another way to do it is with .text replicated in the kernel
image at different virtual addresses, 2M naturally aligned. So then you
can have each NUMA node's kernel entry points set at different offsets,
and during a context switch across nodes you can adjust the regs->eip
depending on the next node you're going to run on. Of course page fault
fixup exceptions will need to learn about these replicated text offsets
too. I'm not 100% sure it's really doable, but at the moment I don't see
anything fundamental that forbids it. That would avoid the tlb
flushes across switch_to.
Andrea
> yes that's worse but that was meant to enlarge the ZONE_NORMAL, not to
> reduce the kmap overhead. Even with the per-task VM virtual zone that
> changes at every switch_to, you'd still have the ZONE_NORMAL shortage
> problem.
I disagree - the problem with ZONE_NORMAL is that we're stuffing
things that aren't global into a global space, and thus using up
space*NR_TASKS instead of space, which we would do if we remapped
things on a per-task basis. Effectively you're enlarging ZONE_NORMAL
by doing this - it's just not global any more.
> actually another way to do it is with .text replicated in the kernel
> image at different virtual addresses, 2M naturally aligned. So then you
> can have each NUMA node's kernel entry points set at different offsets,
> and during a context switch across nodes you can adjust the regs->eip
> depending on the next node you're going to run on. Of course page fault
> fixup exceptions will need to learn about these replicated text offsets
> too. I'm not 100% sure it's really doable, but at the moment I don't see
> anything fundamental that forbids it. That would avoid the tlb
> flushes across switch_to.
This would be roughly the plan if we were to do segmentation tricks,
but I can't see how you'd avoid rewriting all the pointer stuff (like
jumps) within the kernel by just having different virtual addresses.
Segmentation (setting CS to the offset) deals with that problem for
you I think, but I'm sure you understand this whole area better than
I do. A bigger problem for 32-bit machines with this is that it consumes
virtual address space at a rate of kernel_size_rounded_up_to_2M *
NR_NODES. See discussion above re: ZONE_NORMAL consumption ;-)
Kernel text replication is going to be an arch specific hack for every
case anyway ... ia64 has some magic to shove fake entries into the TLB
I believe, but we don't have that ;-(
M.
Dan Kegel wrote:
>You can actually consider posix AIO using sigtimedwait() to pick up completion
>notices to fit the definition of completion port if you squint a bit.
>
Except that signal queues are far too short to be useful for c10k. It's
also not possible to allocate a queue (signal number) in a thread safe
manner.
Posix AIO is a horrid interface. Ben has done much better.
Hi!
>
> But if you have such a mapping, then you _cannot_ make a per-task VM
> space, because many tasks will share the same VM. You cannot even do a
> per-cpu mapping change (and rewrite the VM on thread switch), since the VM
> is _shared_ across CPU's, and absolutely has to be in order to work with
> CPU's that do TLB fill in hardware (eg x86).
You could have different %cr3 on different CPUs and use page tables as TLBs
(emulating software-filled TLBs, basically); but that smells like "bye bye
performance".
Pavel
--
Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt,
details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html.
Could somebody explain the semantics of the io_queue_wait call in the
libaio? If you pass nr == 0 to getevents, getevents will do nothing. I
don't see the point of it so I'm unsure what's the right implementation.
then about the 2.5 API we have such min_nr that allows the "at least
min_nr", instead of the previous default of "at least 1", so that it
allows implementing the aio_nwait of aix.
However, the code checks for min_nr being > 0, but a min_nr == 0 doesn't
make sense. So min_nr should always be > 0 (in fact the previous default
was at least 1, because, as said, at least 0 doesn't make sense). Same
issue with nr: nr == 0 also doesn't make sense to me, and I think nr
should be > 0 as well (that's my issue with the apparently pointless
io_queue_wait too).
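As an aside, an aio_nwait()-style wrapper over this interface would just be
a thin sketch like the following (with min_nr and nr both assumed > 0, per
the above; wait_for_batch is a made-up name):

#include <libaio.h>

/* block until at least 'want' events are ready and pick up as many as
   'room' allows; returns the number of events or a negative error */
int wait_for_batch(io_context_t ctx, struct io_event *events,
		   long want, long room)
{
	return io_getevents(ctx, want, room, events, NULL);
}

With want == 1 this degenerates to the old at-least-one behaviour.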
However, as long as the API doesn't change much I'm fine; if there are
minor -EINVAL differences with bad inputs there shouldn't be many
compatibility issues, and right now we're more permissive, so if
anything 2.6 will be less permissive, and that will guarantee apps written
for 2.6 work right on current 2.5.
So what I'm doing now is staying in sync with 2.5, and I'm implementing
io_queue_wait this way:
int io_queue_wait(io_context_t ctx, const struct timespec *timeout)
{
return io_getevents(ctx, 0, 0, NULL, timeout);
}
My preferred solution is to kill io_queue_wait, which apparently only
generates a spurious lookup of the io context in-kernel, and then to
force min_nr > 0 and nr > 0. But I need your opinion on this, also
because you certainly know the semantics of io_queue_wait, which I
couldn't easily reverse engineer from the source code (or maybe I
overlooked something in the source code, possible).
Grepping l-k for io_queue_wait shows no results; google only shows the
glibc patches with no comment at all. The regression tests never use it
either. Of course it's not a surprise, since as far as I can tell it cannot
do anything with either the old or the new code, but I need to find out
whether it is buggy or whether it should really be dropped.
BTW, the libaio I'm adapting to test on my tree will not have the
libredhat thing anymore, and it will use the mainline 2.5 API since the
API is registered now; in the very worst case a non-backwards-compatible
API change would happen in late 2.5, and replacing libaio.so is no more
complex than replacing libredhat.so anyway ;).
Andrea
Changed the title to reflect the latest discussion. Just wanted
to comment on the nwait bit.
On Mon, Sep 02, 2002 at 08:40:43PM +0200, Andrea Arcangeli wrote:
>
> then about the 2.5 API we have such min_nr that allows the "at least
> min_nr", instead of the previous default of "at least 1", so that it
> allows implementing the aio_nwait of aix.
Partly, in the sense that the implementation still doesn't avoid
extra wakeups when fewer than min_nr events are available at a time
(if we are unlucky enough to have the min_nr events dripping in
slowly one at a time, we'd still have all those context switches,
wouldn't we?), though it saves on the extra user-kernel transitions
on those wakeups compared to implementing this in user space
over an at-least-one primitive.
It is possible to play around with the implementation later though.
The important bit is having "at least N" in the interface exported
by the kernel, which is good.
Regards
Suparna
> Andrea
On Mon, Sep 02, 2002 at 08:40:43PM +0200, Andrea Arcangeli wrote:
> Could somebody explain the semantics of the io_queue_wait call in the
> libaio? If you pass nr == 0 to getevents, getevents will do nothing. I
> don't see the point of it so I'm unsure what's the right implementation.
It was supposed to wait for events to be ready. In reality what ended up
happening is that people don't actually like to use io_queue_run/wait and
the function callbacks as Linus originally suggested in the event model,
and instead they prefer to use io_getevents directly. libaio just hasn't
been updated to reflect that yet.
> then about the 2.5 API we have such min_nr that allows the "at least
> min_nr", instead of the previous default of "at least 1", so that it
> allows implementing the aio_nwait of aix.
It was also required to break source compilation for the timeout update.
> BTW, the libaio I'm adapting to test on my tree will not have the
> libredhat thing anymore, and it will use the mainline 2.5 API since the
> API is registered now; in the very worst case a non-backwards-compatible
> API change would happen in late 2.5, and replacing libaio.so is no more
> complex than replacing libredhat.so anyway ;).
That was already the intent for libaio-0.4.0.
-ben
--
"You will be reincarnated as a toad; and you will be much happier."