Resend because of wordwrapping, thanks Andrew
S?bastien.
AIO completion notification
The current 2.6 kernel does not support notification of user space via
an RT signal upon an asynchronous IO completion. The POSIX specification
states that when an AIO request completes, a signal can be delivered to
the application as notification.
The aioevent patch adds a struct sigevent *aio_sigeventp to the iocb.
The relevant fields (pid, signal number and value) are stored in the kiocb
for use when the request completes.
That sigevent structure is filled by the application as part of the AIO
request preparation. Upon request completion, the kernel notifies the
application using those sigevent parameters. If SIGEV_NONE has been specified,
then the old behaviour is retained and the application must rely on polling
the completion queue using io_getevents().
A struct sigevent *aio_sigeventp field is added to struct iocb in
include/linux/aio_abi.h
A struct aio_notify containing the sigevent parameters is defined in aio.h:
struct aio_notify {
pid_t pid;
__u16 signo;
__u16 notify;
sigval_t value;
};
A struct aio_notify ki_notify is added to struct kiocb in include/linux/aio.h
In io_submit_one(), if the application provided a sigevent then
setup_sigevent() is called which does the following:
- check access to the user sigevent and make a local copy
- if the requested notification is SIGEV_NONE, then nothing to do
- fill in the kiocb->ki_notify fields (notify, signo, value)
- check sigevent consistency, get the signal target task and
save it in kiocb->ki_notify.pid
Upon request completion, in aio_complete(), if notification is needed for
this request (iocb->ki_notify.notify != SIGEV_NONE), then aio_send_signal()
is called to signal the target task as follows:
- fill in the siginfo struct to be sent to the task
- if notify is SIGEV_THREAD_ID then send signal to specific task
using specific_send_sig_info()
- else send signal to task group using __group_send_sig_info()
NOTE: specific_send_sig_info() has to be made non static and exported to be
used here.
fs/aio.c | 197 ++++++++++++++++++++++++++++++++++++++----------
include/linux/aio.h | 11 ++
include/linux/aio_abi.h | 3
include/linux/signal.h | 1
kernel/signal.c | 2
5 files changed, 172 insertions(+), 42 deletions(-)
Signed-off-by: Laurent Vivier <[email protected]>
Signed-off-by: S?bastien Dugu? <[email protected]>
Signed-off-by: Benjamin LaHaise <[email protected]>
Index: linux-2.6.18-rc6/fs/aio.c
===================================================================
--- linux-2.6.18-rc6.orig/fs/aio.c 2006-09-04 17:03:56.000000000 +0200
+++ linux-2.6.18-rc6/fs/aio.c 2006-09-06 16:38:10.000000000 +0200
@@ -925,6 +925,140 @@ void fastcall kick_iocb(struct kiocb *io
}
EXPORT_SYMBOL(kick_iocb);
+static void aio_send_signal(struct aio_notify *notify)
+{
+ struct siginfo info;
+ struct task_struct *p;
+ unsigned long flags;
+ int ret = -1;
+
+ memset(&info, 0, sizeof(struct siginfo));
+
+ info.si_signo = notify->signo;
+ info.si_errno = 0;
+ info.si_code = SI_ASYNCIO;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ info.si_value = notify->value;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(notify->pid);
+
+ if (!p || !p->sighand)
+ goto out_unlock;
+
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+
+ if (notify->notify & SIGEV_THREAD_ID)
+ ret = specific_send_sig_info(notify->signo, &info, p);
+ else
+ ret = __group_send_sig_info(notify->signo, &info, p);
+
+
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+
+ if (ret)
+ printk(KERN_DEBUG "aio_send_signal: failed to send signal %d to %d\n",
+ notify->signo, notify->pid);
+out_unlock:
+ read_unlock(&tasklist_lock);
+}
+
+static struct task_struct * good_sigevent(sigevent_t * event)
+{
+ struct task_struct *target = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+ (!(target = find_task_by_pid(event->sigev_notify_thread_id)) ||
+ target->tgid != current->tgid ||
+ (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ return NULL;
+
+ if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
+ ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+ return NULL;
+
+ return target;
+}
+
+static long setup_sigevent(struct aio_notify *notify,
+ struct sigevent __user *user_event)
+{
+ int error = 0;
+ sigevent_t event;
+ struct task_struct *target;
+
+ if (!access_ok(VERIFY_READ, user_event, sizeof(struct sigevent)))
+ return -EFAULT;
+
+ if (copy_from_user(&event, user_event, sizeof (event)))
+ return -EFAULT;
+
+ /* Check for the SIGEV_NONE case */
+ if (event.sigev_notify == SIGEV_NONE)
+ return 0;
+
+ /* Setup the request completion notification parameters */
+ notify->notify = event.sigev_notify;
+ notify->signo = event.sigev_signo;
+ notify->value = event.sigev_value;
+
+ /* Now get the notification target */
+ read_lock(&tasklist_lock);
+
+ if ((target = good_sigevent(&event)))
+ notify->pid = target->pid;
+ else
+ error = -EINVAL;
+
+ read_unlock(&tasklist_lock);
+
+ return error;
+}
+
+static void __aio_write_evt(struct kioctx *ctx, struct io_event *event)
+{
+ struct aio_ring_info *info;
+ struct aio_ring *ring;
+ struct io_event *ring_event;
+ unsigned long tail;
+
+ info = &ctx->ring_info;
+
+ /* add a completion event to the ring buffer.
+ * must be done holding ctx->ctx_lock to prevent
+ * other code from messing with the tail
+ * pointer since we might be called from irq
+ * context.
+ */
+
+ ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+ tail = info->tail;
+ ring_event = aio_ring_event(info, tail, KM_IRQ0);
+ if (++tail >= info->nr)
+ tail = 0;
+
+ *ring_event = *event;
+
+ dprintk("aio_write_evt: %p[%lu]: %Lx %Lx %Lx %Lx\n",
+ ctx, tail, event->obj, event->data, event->res, event->res2);
+
+ /* after flagging the request as done, we
+ * must never even look at it again
+ */
+
+ smp_wmb(); /* make event visible before updating tail */
+
+ info->tail = tail;
+ ring->tail = tail;
+
+ put_aio_ring_event(ring_event, KM_IRQ0);
+ kunmap_atomic(ring, KM_IRQ1);
+
+ pr_debug("added to ring at [%lu]\n", tail);
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
* Returns true if this is the last user of the request. The
@@ -933,11 +1067,8 @@ EXPORT_SYMBOL(kick_iocb);
int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
- struct aio_ring *ring;
- struct io_event *event;
+ struct io_event event;
unsigned long flags;
- unsigned long tail;
int ret;
/*
@@ -955,14 +1086,13 @@ int fastcall aio_complete(struct kiocb *
return 1;
}
- info = &ctx->ring_info;
+ /* insert event in the event ring */
+
+ event.obj = (u64)(unsigned long)iocb->ki_obj.user;
+ event.data = iocb->ki_user_data;
+ event.res = res;
+ event.res2 = res2;
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
- */
spin_lock_irqsave(&ctx->ctx_lock, flags);
if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
@@ -975,34 +1105,10 @@ int fastcall aio_complete(struct kiocb *
if (kiocbIsCancelled(iocb))
goto put_rq;
- ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
-
- tail = info->tail;
- event = aio_ring_event(info, tail, KM_IRQ0);
- if (++tail >= info->nr)
- tail = 0;
-
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
-
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
-
- /* after flagging the request as done, we
- * must never even look at it again
- */
- smp_wmb(); /* make event visible before updating tail */
-
- info->tail = tail;
- ring->tail = tail;
-
- put_aio_ring_event(event, KM_IRQ0);
- kunmap_atomic(ring, KM_IRQ1);
+ __aio_write_evt(ctx, &event);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ if (iocb->ki_notify.notify != SIGEV_NONE)
+ aio_send_signal(&iocb->ki_notify);
pr_debug("%ld retries: %d of %d\n", iocb->ki_retried,
iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
@@ -1481,8 +1587,7 @@ int fastcall io_submit_one(struct kioctx
ssize_t ret;
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
- iocb->aio_reserved3)) {
+ if (unlikely(iocb->aio_reserved1)) {
pr_debug("EINVAL: io_submit: reserve field set\n");
return -EINVAL;
}
@@ -1491,6 +1596,7 @@ int fastcall io_submit_one(struct kioctx
if (unlikely(
(iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
(iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+ (iocb->aio_sigeventp != (unsigned long)iocb->aio_sigeventp) ||
((ssize_t)iocb->aio_nbytes < 0)
)) {
pr_debug("EINVAL: io_submit: overflow check\n");
@@ -1525,6 +1631,17 @@ int fastcall io_submit_one(struct kioctx
INIT_LIST_HEAD(&req->ki_wait.task_list);
req->ki_retried = 0;
+ /* handle setting up the sigevent for POSIX AIO signals */
+ req->ki_notify.notify = SIGEV_NONE;
+
+ if (iocb->aio_sigeventp) {
+ ret = setup_sigevent(&req->ki_notify,
+ (struct sigevent __user *)(unsigned long)
+ iocb->aio_sigeventp);
+ if (ret)
+ goto out_put_req;
+ }
+
ret = aio_setup_iocb(req);
if (ret)
Index: linux-2.6.18-rc6/include/linux/aio_abi.h
===================================================================
--- linux-2.6.18-rc6.orig/include/linux/aio_abi.h 2006-06-18 03:49:35.000000000 +0200
+++ linux-2.6.18-rc6/include/linux/aio_abi.h 2006-09-06 16:32:17.000000000 +0200
@@ -80,8 +80,9 @@ struct iocb {
__u64 aio_nbytes;
__s64 aio_offset;
+ __u64 aio_sigeventp; /* pointer to struct sigevent */
+
/* extra parameters */
- __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
__u64 aio_reserved3;
}; /* 64 bytes */
Index: linux-2.6.18-rc6/include/linux/aio.h
===================================================================
--- linux-2.6.18-rc6.orig/include/linux/aio.h 2006-06-18 03:49:35.000000000 +0200
+++ linux-2.6.18-rc6/include/linux/aio.h 2006-09-06 16:32:17.000000000 +0200
@@ -6,6 +6,7 @@
#include <linux/aio_abi.h>
#include <asm/atomic.h>
+#include <asm/siginfo.h>
#define AIO_MAXSEGS 4
#define AIO_KIOGRP_NR_ATOMIC 8
@@ -48,6 +49,13 @@ struct kioctx;
#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags)
#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+struct aio_notify {
+ pid_t pid;
+ __u16 signo;
+ __u16 notify;
+ sigval_t value;
+};
+
/* is there a better place to document function pointer methods? */
/**
* ki_retry - iocb forward progress callback
@@ -115,6 +123,9 @@ struct kiocb {
struct list_head ki_list; /* the aio core uses this
* for cancellation */
+
+ /* to notify a process on I/O event */
+ struct aio_notify ki_notify;
};
#define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.18-rc6/include/linux/signal.h
===================================================================
--- linux-2.6.18-rc6.orig/include/linux/signal.h 2006-09-04 17:07:26.000000000 +0200
+++ linux-2.6.18-rc6/include/linux/signal.h 2006-09-05 16:37:23.000000000 +0200
@@ -233,6 +233,7 @@ static inline int valid_signal(unsigned
return sig <= _NSIG ? 1 : 0;
}
+extern int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t);
extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
extern long do_sigpending(void __user *, unsigned long);
Index: linux-2.6.18-rc6/kernel/signal.c
===================================================================
--- linux-2.6.18-rc6.orig/kernel/signal.c 2006-09-04 17:07:56.000000000 +0200
+++ linux-2.6.18-rc6/kernel/signal.c 2006-09-05 16:34:18.000000000 +0200
@@ -763,7 +763,7 @@ out_set:
(((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
-static int
+int
specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
int ret = 0;
-----------------------------------------------------
S?bastien Dugu? BULL/FREC:B1-247
phone: (+33) 476 29 77 70 Bullcom: 229-7770
mailto:[email protected]
Linux POSIX AIO: http://www.bullopensource.org/posix
http://sourceforge.net/projects/paiol
-----------------------------------------------------
> AIO completion notification
Thanks for sending this again.
> The current 2.6 kernel does not support notification of user space via
> an RT signal upon an asynchronous IO completion. The POSIX specification
> states that when an AIO request completes, a signal can be delivered to
> the application as notification.
Does it say anything about the reliability of that signal delivery? The
current implementation lets IO submission succeed but signal delivery
fail and that makes me nervous. What are the API design goals here? If
I missed some previous discussion could we have them summarized in a
comment above aio_send_signal() ? More inline..
> +static void aio_send_signal(struct aio_notify *notify)
> +{
> + struct siginfo info;
That's pretty big to be putting on the stack down under aio_complete()
which itself is called from block -> fs completion handlers. It's a
path with pretty strong stack pressure. It looks like we only use a
fraction of the struct, too, so can we rework the signal delivery code a
little to allow us to specify our inputs more efficiently? I guess it's
not a big deal, but it'd be nice to get right if we can.
> + info.si_code = SI_ASYNCIO;
It looks like USB (usbfs? urb mumble something?) is already using
SI_ASYNCIO. Is that a problem?
> + if (notify->notify & SIGEV_THREAD_ID)
> + ret = specific_send_sig_info(notify->signo, &info, p);
> + else
> + ret = __group_send_sig_info(notify->signo, &info, p);
> +
> +
> + spin_unlock_irqrestore(&p->sighand->siglock, flags);
> +
> + if (ret)
> + printk(KERN_DEBUG "aio_send_signal: failed to send signal %d to %d\n",
> + notify->signo, notify->pid);
It looks like this is trivial to make fail if one submits more
operations than the RLIMIT_SIGPENDING rlimit will allow queued signals
for. So we definitely don't want a printk that the user can generate.
This is where I'm nervous about what promises we've made about signal
delivery back at submit time :). If the posix API allows lossy delivery
then we're OK.
And it can fail if delivery can't allocate a struct sigqueue. I wonder
if we should allocate one at submit time and hang it off the iocb to be
used at completion. It's just a little struct with a list_head.
> +static struct task_struct * good_sigevent(sigevent_t * event)
This is lifted directly from kernel/posix-timers.c, including the lack
of any indication that it has to be called with the tasklist_lock held
'cause it calls find_task_by_pid(). If we're making this a shared
notion lets put it somewhere that both posix-timers.c and aio.c can get
at. kernel/signal.c, presumably?
> + if (!access_ok(VERIFY_READ, user_event, sizeof(struct sigevent)))
> + return -EFAULT;
You don't need this access_ok(), I don't think. copy_from_user() should
return non-zero if something goes wrong.
> + if (copy_from_user(&event, user_event, sizeof (event)))
> + return -EFAULT;
sigevent is chock-full of members that need compat help when 32bit
userspace hands it to a 64bit kernel. See compat_sys_timer_create() ->
get_compat_sigevent().
I'm not sure how best to handle this. It'd be pretty gross to allocate
compat copies of every sigevent on the stack like compat_sys_io_submit()
seems to do for the 32bit iocb pointers. I imagine you'll want to
conditionally call get_compat_sigevent() for each sigevent somehow.
Or push the problem to userspace by pointing from the iocb to a special
case sigevent with fixed width notify, signo, and value.
None of those options seem thrilling :/.
> +static void __aio_write_evt(struct kioctx *ctx, struct io_event *event)
Making this its own function seems to be unrelated to the completion
signaling work. Please drop it from the patch so that we don't have to
audit it to make sure that code motion didn't mess something up.
- z
On Tue, 2006-09-12 at 15:52 -0700, Zach Brown wrote:
> > AIO completion notification
>
> Thanks for sending this again.
>
> > The current 2.6 kernel does not support notification of user space via
> > an RT signal upon an asynchronous IO completion. The POSIX specification
> > states that when an AIO request completes, a signal can be delivered to
> > the application as notification.
>
> Does it say anything about the reliability of that signal delivery? The
> current implementation lets IO submission succeed but signal delivery
> fail and that makes me nervous. What are the API design goals here? If
> I missed some previous discussion could we have them summarized in a
> comment above aio_send_signal() ? More inline..
I could not find anything concerning signal delivery reliability, but
POSIX AIO being part of the realtime POSIX extension, I'd tend to think
reliability should be taken into acount.
Aside from that could you elaborate on the signal delivery
failures you have in mind? From what I gathered, the only failure I can
see is if __sigqueue_alloc() fails to allocate from the cache or we
reached the RLIMIT_SIGPENDING limit for the task.
>
> > +static void aio_send_signal(struct aio_notify *notify)
> > +{
> > + struct siginfo info;
>
> That's pretty big to be putting on the stack down under aio_complete()
> which itself is called from block -> fs completion handlers. It's a
> path with pretty strong stack pressure. It looks like we only use a
> fraction of the struct, too, so can we rework the signal delivery code a
> little to allow us to specify our inputs more efficiently? I guess it's
> not a big deal, but it'd be nice to get right if we can.
Right, I'm open to suggestions here.
>
> > + info.si_code = SI_ASYNCIO;
>
> It looks like USB (usbfs? urb mumble something?) is already using
> SI_ASYNCIO. Is that a problem?
Well SusV3 states: SI_ASYNCIO - Signal generated by completion of an
asynchronous I/O request.
From what I understand, the usb core code uses SI_ASYNCIO in two
places:
- in usbfs_remove_device() and it's a mystery to me why it needs to
set si_code
- in async_completed() to notify upon completion of an async IO
operation which is consistent with the specification
Anyway, I don't see it as a problem, but maybe I'm missing something.
>
> > + if (notify->notify & SIGEV_THREAD_ID)
> > + ret = specific_send_sig_info(notify->signo, &info, p);
> > + else
> > + ret = __group_send_sig_info(notify->signo, &info, p);
> > +
> > +
> > + spin_unlock_irqrestore(&p->sighand->siglock, flags);
> > +
> > + if (ret)
> > + printk(KERN_DEBUG "aio_send_signal: failed to send signal %d to %d\n",
> > + notify->signo, notify->pid);
>
> It looks like this is trivial to make fail if one submits more
> operations than the RLIMIT_SIGPENDING rlimit will allow queued signals
> for. So we definitely don't want a printk that the user can generate.
Argh, yes.
>
> This is where I'm nervous about what promises we've made about signal
> delivery back at submit time :). If the posix API allows lossy delivery
> then we're OK.
>
> And it can fail if delivery can't allocate a struct sigqueue. I wonder
> if we should allocate one at submit time and hang it off the iocb to be
> used at completion. It's just a little struct with a list_head.
That may be a solution if signal delivery reliability is
critical, so that we fail at submission time.
>
> > +static struct task_struct * good_sigevent(sigevent_t * event)
>
> This is lifted directly from kernel/posix-timers.c, including the lack
> of any indication that it has to be called with the tasklist_lock held
> 'cause it calls find_task_by_pid(). If we're making this a shared
> notion lets put it somewhere that both posix-timers.c and aio.c can get
> at. kernel/signal.c, presumably?
Good point, I will do that along with a comment stating the locking
constraints.
>
> > + if (!access_ok(VERIFY_READ, user_event, sizeof(struct sigevent)))
> > + return -EFAULT;
>
> You don't need this access_ok(), I don't think. copy_from_user() should
> return non-zero if something goes wrong.
OK, but then the comment for __copy_from_user_inatomic() in
asm-*/uaccess.h should be changed ("Caller must check the specified
block with access_ok() before calling this function").
>
> > + if (copy_from_user(&event, user_event, sizeof (event)))
> > + return -EFAULT;
>
> sigevent is chock-full of members that need compat help when 32bit
> userspace hands it to a 64bit kernel. See compat_sys_timer_create() ->
> get_compat_sigevent().
>
> I'm not sure how best to handle this. It'd be pretty gross to allocate
> compat copies of every sigevent on the stack like compat_sys_io_submit()
> seems to do for the 32bit iocb pointers. I imagine you'll want to
> conditionally call get_compat_sigevent() for each sigevent somehow.
>
> Or push the problem to userspace by pointing from the iocb to a special
> case sigevent with fixed width notify, signo, and value.
>
> None of those options seem thrilling :/.
Well, I'm not sure either and I'm open for discussion.
>
> > +static void __aio_write_evt(struct kioctx *ctx, struct io_event *event)
>
> Making this its own function seems to be unrelated to the completion
> signaling work. Please drop it from the patch so that we don't have to
> audit it to make sure that code motion didn't mess something up.
Oops, that slipped past into the patch, will remove.
Thanks, for your comments.
S?bastien.
--
-----------------------------------------------------
S?bastien Dugu? BULL/FREC:B1-247
phone: (+33) 476 29 77 70 Bullcom: 229-7770
mailto:[email protected]
Linux POSIX AIO: http://www.bullopensource.org/posix
http://sourceforge.net/projects/paiol
-----------------------------------------------------