Hi
Here is the latest rework of the AIO completion signal notification patches.
This set consists of 5 patches:
1. rework-compat-sys-io-submit: cleans up the sys_io_submit() compat layer,
   making it more efficient and laying the groundwork for the following patches
2. aio-header-fix-includes: fixes the double inclusion of uio.h in aio.h
3. export-good_sigevent: moves good_sigevent() into signal.c and makes it
   non-static
4. aio-notify-sig: the AIO completion signal notification
5. listio: adds listio support
Descriptions are in the individual patches.
Changes from v3:
All changes following comments from Zach Brown and Christoph Hellwig
- added justification for the compat_sys_io_submit() cleanup
- more cleanups in compat_sys_io_submit() to put it in line with
sys_io_submit()
- Changed "Export good_sigevent()" patch name to "Make good_sigevent()
non-static" to better describe what it does.
- Reworked good_sigevent() to make it more readable.
- Simplified the use of the SIGEV_* constants in signal notification
- Take a reference on the target task in both the SIGEV_THREAD_ID and
SIGEV_SIGNAL cases.
Changes from v2:
- rebased to 2.6.19-rc6-mm2
- reworked the sys_io_submit() compat layer as suggested by Zach Brown
- fixed saving of a pointer to a task struct in aio-notify-sig as
pointed out by Andrew Morton
Changes from v1:
- cleanups suggested by Christoph Hellwig, Badari Pulavarty and Zach Brown
- added listio patch
Thanks for your comments; more are welcome, as usual.
Thanks,
Sébastien.
POSIX listio support
This patch adds POSIX listio completion notification support. It builds
on support provided by the aio signal notification patch and adds an
IOCB_CMD_GROUP command to io_submit().
The purpose of IOCB_CMD_GROUP is to group together the requests that follow it
in the list, up to the end of the list submitted to io_submit().
As io_submit() already accepts an array of iocbs, the user process performs a
listio submission by prepending to the list of requests a special, otherwise
empty, iocb with an aio_lio_opcode of IOCB_CMD_GROUP, filling in only the
aio_sigeventp field.
An IOCB_CMD_GROUP is added to the IOCB_CMD enum in include/linux/aio_abi.h
A struct lio_event is added in include/linux/aio.h
A struct lio_event *ki_lio is added to struct iocb in include/linux/aio.h
In io_submit(), upon detecting such an IOCB_CMD_GROUP marker iocb, an
lio_event is created in lio_create() which contains the necessary information
for signaling a thread (signal number, pid, notify type and value) along with
a count of requests attached to this event.
The following depicts the lio_event structure:
struct lio_event {
        atomic_t                lio_users;
        struct aio_notify       lio_notify;
};
lio_users holds an atomic counter of the number of requests attached to this
lio. It is incremented with each request submitted and decremented at each
request completion. When the counter reaches 0, we send the notification.
Each subsequent submitted request is attached to this lio_event by setting
the request kiocb->ki_lio to that lio_event (in io_submit_one()) and
incrementing the lio_users count.
In aio_complete(), if the request is attached to an lio (ki_lio != NULL),
lio_check() is called to decrement the lio_users count and, once all the
requests in the group have completed, signal the user process.
The IOCB_CMD_GROUP semantics are as follows (see the userspace sketch below):
        - if the associated sigevent is NULL, then we want to group
          requests for the purpose of blocking on the group completion
          (LIO_WAIT sync behavior).
        - if the associated sigevent is valid (not NULL), then we want to
          group requests for the purpose of being notified upon completion
          of that group of requests (LIO_NOWAIT async behavior).
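To make the userspace side concrete, here is a minimal sketch of a
LIO_NOWAIT-style submission using the new IOCB_CMD_GROUP marker. It uses the
raw io_submit() syscall rather than libaio, assumes the aio_abi.h additions
from this patch, and the submit_group() helper with its parameters is purely
illustrative (error handling omitted):

#include <linux/aio_abi.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(__NR_io_submit, ctx, nr, iocbpp);
}

/* Submit two reads as one group and get SIGRTMIN when both have completed. */
static long submit_group(aio_context_t ctx, int fd, void *buf1, void *buf2,
			 size_t len)
{
	struct sigevent ev;
	struct iocb group, io1, io2;
	struct iocb *list[3] = { &group, &io1, &io2 };

	/* Group marker: only aio_lio_opcode and aio_sigeventp are used. */
	memset(&group, 0, sizeof(group));
	group.aio_lio_opcode = IOCB_CMD_GROUP;

	memset(&ev, 0, sizeof(ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo  = SIGRTMIN;
	group.aio_sigeventp = (__u64)(unsigned long)&ev;
	/*
	 * Leaving group.aio_sigeventp at 0 would instead make io_submit()
	 * block until the whole group has completed (LIO_WAIT behavior).
	 */

	/* The requests following the marker all belong to the group. */
	memset(&io1, 0, sizeof(io1));
	io1.aio_lio_opcode = IOCB_CMD_PREAD;
	io1.aio_fildes     = fd;
	io1.aio_buf        = (__u64)(unsigned long)buf1;
	io1.aio_nbytes     = len;
	io1.aio_offset     = 0;

	io2 = io1;
	io2.aio_buf    = (__u64)(unsigned long)buf2;
	io2.aio_offset = len;

	/* The marker and its requests are submitted in a single call. */
	return io_submit(ctx, 3, list);
}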
fs/aio.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/compat.c | 62 +++++++++++++++++++++++-
include/linux/aio.h | 15 +++++
include/linux/aio_abi.h | 1
4 files changed, 192 insertions(+), 9 deletions(-)
Signed-off-by: Sébastien Dugué <[email protected]>
Signed-off-by: Laurent Vivier <[email protected]>
Index: linux-2.6.19-rc6-mm2/fs/aio.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/aio.c 2006-11-30 15:18:52.000000000 +0100
+++ linux-2.6.19-rc6-mm2/fs/aio.c 2006-11-30 15:27:27.000000000 +0100
@@ -414,6 +414,7 @@ static struct kiocb fastcall *__aio_get_
req->ki_cancel = NULL;
req->ki_retry = NULL;
req->ki_dtor = NULL;
+ req->ki_lio = NULL;
req->private = NULL;
req->ki_iovec = NULL;
req->ki_notify.sigq = NULL;
@@ -1009,6 +1010,53 @@ out_unlock:
return -EINVAL;
}
+void lio_check(struct lio_event *lio)
+{
+ int ret;
+
+ ret = atomic_dec_and_test(&lio->lio_users);
+
+ if (unlikely(ret) && lio->lio_notify.notify != SIGEV_NONE) {
+ /* last one -> notify process */
+ aio_send_signal(&lio->lio_notify);
+ kfree(lio);
+ }
+}
+
+struct lio_event *lio_create(struct sigevent __user *user_event)
+{
+ int ret = 0;
+ struct lio_event *lio = NULL;
+
+ lio = kzalloc(sizeof(*lio), GFP_KERNEL);
+
+ if (!lio)
+ return ERR_PTR(-EAGAIN);
+
+ /*
+ * Grab an initial ref on the lio to avoid races between
+ * submission and completion.
+ */
+ atomic_set(&lio->lio_users, 1);
+
+ lio->lio_notify.notify = SIGEV_NONE;
+
+ if (user_event) {
+ /*
+ * User specified an event for this lio,
+ * he wants to be notified upon lio completion.
+ */
+ ret = aio_setup_sigevent(&lio->lio_notify, user_event);
+
+ if (ret) {
+ kfree(lio);
+ return ERR_PTR(ret);
+ }
+ }
+
+ return lio;
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
* Returns true if this is the last user of the request. The
@@ -1057,8 +1105,12 @@ int fastcall aio_complete(struct kiocb *
* when the event got cancelled.
*/
if (kiocbIsCancelled(iocb)) {
+ if (iocb->ki_lio)
+ lio_check(iocb->ki_lio);
+
if (iocb->ki_notify.sigq)
sigqueue_free(iocb->ki_notify.sigq);
+
goto put_rq;
}
@@ -1099,6 +1151,9 @@ int fastcall aio_complete(struct kiocb *
sigqueue_free(iocb->ki_notify.sigq);
}
+ if (iocb->ki_lio)
+ lio_check(iocb->ki_lio);
+
pr_debug("%ld retries: %zd of %zd\n", iocb->ki_retried,
iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
put_rq:
@@ -1633,7 +1688,7 @@ static int aio_wake_function(wait_queue_
}
int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb)
+ struct iocb *iocb, struct lio_event *lio)
{
struct kiocb *req;
struct file *file;
@@ -1695,6 +1750,9 @@ int fastcall io_submit_one(struct kioctx
goto out_put_req;
}
+ /* Attach this iocb to its lio */
+ req->ki_lio = lio;
+
ret = aio_setup_iocb(req);
if (ret)
@@ -1738,6 +1796,8 @@ asmlinkage long sys_io_submit(aio_contex
struct iocb __user * __user *iocbpp)
{
struct kioctx *ctx;
+ struct lio_event *lio = NULL;
+ int lio_wait = 0;
long ret = 0;
int i;
@@ -1771,11 +1831,66 @@ asmlinkage long sys_io_submit(aio_contex
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp);
- if (ret)
- break;
+ if (tmp.aio_lio_opcode == IOCB_CMD_GROUP) {
+
+ /* this command means that all following IO commands
+ * are in the same group.
+ *
+ * Userspace either wants to be notified upon or block until
+ * completion of all the requests in the group.
+ */
+ /*
+ * Ignore an IOCB_CMD_GROUP request if we are already
+ * processing one. This means only one listio per
+ * io_submit call.
+ */
+ if (lio)
+ continue;
+
+ lio = lio_create((struct sigevent __user *)(unsigned long)
+ tmp.aio_sigeventp);
+
+ ret = PTR_ERR(lio);
+
+ if (IS_ERR(lio))
+ goto out_put_ctx;
+
+ if (!tmp.aio_sigeventp)
+ lio_wait = 1;
+ } else {
+ if (lio)
+ atomic_inc(&lio->lio_users);
+
+ ret = io_submit_one(ctx, user_iocb, &tmp, lio);
+
+ if (ret) {
+ if (lio) {
+ /*
+ * If a request failed, just decrement
+ * the users count, but go on submitting
+ * subsequent requests.
+ */
+ atomic_dec(&lio->lio_users);
+ } else
+ break;
+ }
+ }
+ }
+
+ if (lio) {
+ /*
+ * Drop extra ref on the lio now that we're done submitting
+ * requests
+ */
+ lio_check(lio);
+
+ if (lio_wait) {
+ wait_event(ctx->wait, atomic_read(&lio->lio_users)==0);
+ kfree(lio);
+ }
}
+out_put_ctx:
put_ioctx(ctx);
return i ? i : ret;
}
Index: linux-2.6.19-rc6-mm2/include/linux/aio_abi.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/aio_abi.h 2006-11-30 15:06:05.000000000 +0100
+++ linux-2.6.19-rc6-mm2/include/linux/aio_abi.h 2006-11-30 15:27:27.000000000 +0100
@@ -43,6 +43,7 @@ enum {
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
+ IOCB_CMD_GROUP = 9,
};
/* read() from /dev/aio returns these structures. */
Index: linux-2.6.19-rc6-mm2/include/linux/aio.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/aio.h 2006-11-30 15:06:05.000000000 +0100
+++ linux-2.6.19-rc6-mm2/include/linux/aio.h 2006-11-30 15:27:27.000000000 +0100
@@ -58,6 +58,11 @@ struct aio_notify {
struct sigqueue *sigq;
};
+struct lio_event {
+ atomic_t lio_users;
+ struct aio_notify lio_notify;
+};
+
/* is there a better place to document function pointer methods? */
/**
* ki_retry - iocb forward progress callback
@@ -113,6 +118,9 @@ struct kiocb {
wait_queue_t ki_wait;
loff_t ki_pos;
+ /* lio this iocb might be attached to */
+ struct lio_event *ki_lio;
+
void *private;
/* State that we remember to be able to restart/retry */
unsigned short ki_opcode;
@@ -220,12 +228,15 @@ struct mm_struct;
extern void FASTCALL(exit_aio(struct mm_struct *mm));
extern struct kioctx *lookup_ioctx(unsigned long ctx_id);
extern int FASTCALL(io_submit_one(struct kioctx *ctx,
- struct iocb __user *user_iocb, struct iocb *iocb));
+ struct iocb __user *user_iocb, struct iocb *iocb,
+ struct lio_event *lio));
/* semi private, but used by the 32bit emulations: */
struct kioctx *lookup_ioctx(unsigned long ctx_id);
int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb));
+ struct iocb *iocb, struct lio_event *lio));
+struct lio_event *lio_create(struct sigevent __user *user_event);
+void lio_check(struct lio_event *lio);
#define get_ioctx(kioctx) do { \
BUG_ON(atomic_read(&(kioctx)->users) <= 0); \
Index: linux-2.6.19-rc6-mm2/fs/compat.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/compat.c 2006-11-30 15:06:05.000000000 +0100
+++ linux-2.6.19-rc6-mm2/fs/compat.c 2006-11-30 15:27:27.000000000 +0100
@@ -646,6 +646,8 @@ asmlinkage long
compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
{
struct kioctx *ctx;
+ struct lio_event *lio = NULL;
+ int lio_wait = 0;
long ret = 0;
int i;
@@ -694,11 +696,65 @@ compat_sys_io_submit(aio_context_t ctx_i
tmp.aio_sigeventp = (__u64)event;
}
- ret = io_submit_one(ctx, user_iocb, &tmp);
- if (ret)
- break;
+ if (tmp.aio_lio_opcode == IOCB_CMD_GROUP) {
+ /* this command means that all following IO commands
+ * are in the same group.
+ *
+ * Userspace either wants to be notified upon or block until
+ * completion of all the requests in the group.
+ */
+ /*
+ * Ignore an IOCB_CMD_GROUP request if we are already
+ * processing one. This means only one listio per
+ * io_submit call.
+ */
+ if (lio)
+ continue;
+
+ lio = lio_create((struct sigevent __user *)(unsigned long)
+ tmp.aio_sigeventp);
+
+ ret = PTR_ERR(lio);
+
+ if (IS_ERR(lio))
+ goto out_put_ctx;
+
+ if (!tmp.aio_sigeventp)
+ lio_wait = 1;
+ } else {
+ if (lio)
+ atomic_inc(&lio->lio_users);
+
+ ret = io_submit_one(ctx, user_iocb, &tmp, lio);
+
+ if (ret) {
+ if (lio) {
+ /*
+ * If a request failed, just decrement
+ * the users count, but go on submitting
+ * subsequent requests.
+ */
+ atomic_dec(&lio->lio_users);
+ } else
+ break;
+ }
+ }
+ }
+
+ if (lio) {
+ /*
+ * Drop extra ref on the lio now that we're done submitting
+ * requests
+ */
+ lio_check(lio);
+
+ if (lio_wait) {
+ wait_event(ctx->wait, atomic_read(&lio->lio_users)==0);
+ kfree(lio);
+ }
}
+out_put_ctx:
put_ioctx(ctx);
return i ? i: ret;
}
AIO completion signal notification
The current 2.6 kernel does not support notification of user space via
an RT signal upon an asynchronous IO completion. The POSIX specification
states that when an AIO request completes, a signal can be delivered to
the application as notification.
This patch adds a struct sigevent *aio_sigeventp to the iocb.
The relevant fields (pid, signal number and value) are stored in the kiocb
for use when the request completes.
That sigevent structure is filled by the application as part of the AIO
request preparation. Upon request completion, the kernel notifies the
application using those sigevent parameters. If SIGEV_NONE has been specified,
then the old behaviour is retained and the application must rely on polling
the completion queue using io_getevents().
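For illustration, here is a minimal userspace sketch of how an application
might arm this per-request notification through the new aio_sigeventp field.
The arm_iocb_notification() helper, the choice of SIGRTMIN and the use of
sival_ptr to carry the iocb back to the handler are purely illustrative:

#include <linux/aio_abi.h>
#include <signal.h>
#include <string.h>

/* si_value carries whatever was placed in sigev_value at submission time. */
static void aio_done(int sig, siginfo_t *si, void *uctx)
{
	struct iocb *done = si->si_value.sival_ptr;

	/* ... mark 'done' as complete, e.g. flag a per-request slot ... */
	(void)done;
	(void)sig;
	(void)uctx;
}

static void arm_iocb_notification(struct iocb *cb, struct sigevent *ev)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = aio_done;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGRTMIN, &sa, NULL);

	memset(ev, 0, sizeof(*ev));
	ev->sigev_notify = SIGEV_SIGNAL;	/* or SIGEV_THREAD_ID + tid */
	ev->sigev_signo  = SIGRTMIN;
	ev->sigev_value.sival_ptr = cb;		/* handed back in si_value */

	/* aio_sigeventp is the new field added to struct iocb by this patch. */
	cb->aio_sigeventp = (__u64)(unsigned long)ev;
}

The sigevent only needs to remain valid for the duration of the io_submit()
call, since the submission path copies it into the kiocb's ki_notify.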
An aio_sigeventp field (a __u64 holding a pointer to a struct sigevent) is
added to struct iocb in include/linux/aio_abi.h
A struct aio_notify containing the sigevent parameters is defined in aio.h:
struct aio_notify {
        struct task_struct      *target;
        __u16                   signo;
        __u16                   notify;
        sigval_t                value;
        struct sigqueue         *sigq;
};
A struct aio_notify ki_notify is added to struct kiocb in include/linux/aio.h
In io_submit_one(), if the application provided a sigevent, then
aio_setup_sigevent() is called, which does the following:
- check access to the user sigevent and make a local copy
- if the requested notification is SIGEV_NONE, then nothing to do
- fill in the kiocb->ki_notify fields (notify, signo, value)
- check sigevent consistency, get the signal target task and
save it in kiocb->ki_notify.target
- preallocate a sigqueue for this event using sigqueue_alloc()
Upon request completion, in aio_complete(), if notification is needed for
this request (iocb->ki_notify.notify != SIGEV_NONE), then aio_send_signal()
is called to signal the target task as follows:
- fill in the siginfo struct to be sent to the task
- if notify is SIGEV_THREAD_ID then send signal to specific task
using send_sigqueue()
- else send signal to the task group using send_group_sigqueue()
Notes concerning sigqueue preallocation:
To ensure reliable delivery of the completion notification, the sigqueue is
preallocated in the submission path so that the allocation cannot fail in the
completion path.
Unlike the posix-timers case (currently the only other user of sigqueue
preallocation), where the sigqueue is allocated for the lifetime of the timer
and freed at timer destruction time, the aio case is a bit more tricky due to
the asynchronous nature of the whole thing.
In the aio case, the sigqueue exists for the lifetime of the request, so it
must be freed only once the signal for the request completion has been
delivered. This involves changing __sigqueue_free() to free the sigqueue when
the signal is collected if si_code is SI_ASYNCIO, even if it was preallocated,
as well as explicitly calling sigqueue_free() in the submission and completion
error paths.
fs/aio.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/compat.c | 18 +++++++
include/linux/aio.h | 12 +++++
include/linux/aio_abi.h | 3 -
kernel/signal.c | 2
5 files changed, 144 insertions(+), 6 deletions(-)
Signed-off-by: Sébastien Dugué <[email protected]>
Signed-off-by: Laurent Vivier <[email protected]>
Index: linux-2.6.19-rc6-mm2/fs/aio.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/aio.c 2006-11-30 10:54:16.000000000 +0100
+++ linux-2.6.19-rc6-mm2/fs/aio.c 2006-11-30 15:18:52.000000000 +0100
@@ -416,6 +416,7 @@ static struct kiocb fastcall *__aio_get_
req->ki_dtor = NULL;
req->private = NULL;
req->ki_iovec = NULL;
+ req->ki_notify.sigq = NULL;
INIT_LIST_HEAD(&req->ki_run_list);
/* Check if the completion queue has enough free space to
@@ -463,6 +464,12 @@ static inline void really_put_req(struct
req->ki_dtor(req);
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
+
+ /* Release task ref */
+ if (req->ki_notify.notify == SIGEV_THREAD_ID ||
+ req->ki_notify.notify == SIGEV_SIGNAL)
+ put_task_struct(req->ki_notify.target);
+
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;
@@ -929,6 +936,79 @@ void fastcall kick_iocb(struct kiocb *io
}
EXPORT_SYMBOL(kick_iocb);
+static int aio_send_signal(struct aio_notify *notify)
+{
+ struct sigqueue *sigq = notify->sigq;
+ struct siginfo *info = &sigq->info;
+ int ret;
+
+ memset(info, 0, sizeof(struct siginfo));
+
+ info->si_signo = notify->signo;
+ info->si_errno = 0;
+ info->si_code = SI_ASYNCIO;
+ info->si_pid = 0;
+ info->si_uid = 0;
+ info->si_value = notify->value;
+
+ if (notify->notify & SIGEV_THREAD_ID)
+ ret = send_sigqueue(notify->signo, sigq, notify->target);
+ else
+ ret = send_group_sigqueue(notify->signo, sigq, notify->target);
+
+ return ret;
+}
+
+static long aio_setup_sigevent(struct aio_notify *notify,
+ struct sigevent __user *user_event)
+{
+ sigevent_t event;
+ struct task_struct *target;
+
+ if (copy_from_user(&event, user_event, sizeof (event)))
+ return -EFAULT;
+
+ if (event.sigev_notify == SIGEV_NONE)
+ return 0;
+
+ notify->notify = event.sigev_notify;
+ notify->signo = event.sigev_signo;
+ notify->value = event.sigev_value;
+
+ read_lock(&tasklist_lock);
+ target = good_sigevent(&event);
+
+ if (unlikely(!target || (target->flags & PF_EXITING)))
+ goto out_unlock;
+
+ /*
+ * At this point, we know that notify is either SIGEV_SIGNAL or
+ * SIGEV_THREAD_ID and the target task is valid. So get a reference
+ * on the task, it will be dropped in really_put_req() when
+ * we're done with the request.
+ */
+ get_task_struct(target);
+ notify->target = target;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * NOTE: we cannot free the sigqueue in the completion path as
+ * the signal may not have been delivered to the target task.
+ * Therefore it has to be freed in __sigqueue_free() when the
+ * signal is collected if si_code is SI_ASYNCIO.
+ */
+ notify->sigq = sigqueue_alloc();
+
+ if (unlikely(!notify->sigq))
+ return -EAGAIN;
+
+ return 0;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return -EINVAL;
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
* Returns true if this is the last user of the request. The
@@ -976,8 +1056,11 @@ int fastcall aio_complete(struct kiocb *
* cancelled requests don't get events, userland was given one
* when the event got cancelled.
*/
- if (kiocbIsCancelled(iocb))
+ if (kiocbIsCancelled(iocb)) {
+ if (iocb->ki_notify.sigq)
+ sigqueue_free(iocb->ki_notify.sigq);
goto put_rq;
+ }
ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
@@ -1008,6 +1091,14 @@ int fastcall aio_complete(struct kiocb *
pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ if (iocb->ki_notify.notify != SIGEV_NONE) {
+ ret = aio_send_signal(&iocb->ki_notify);
+
+ /* If signal generation failed, release the sigqueue */
+ if (ret)
+ sigqueue_free(iocb->ki_notify.sigq);
+ }
+
pr_debug("%ld retries: %zd of %zd\n", iocb->ki_retried,
iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
put_rq:
@@ -1549,8 +1640,7 @@ int fastcall io_submit_one(struct kioctx
ssize_t ret;
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
- iocb->aio_reserved3)) {
+ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved3)) {
pr_debug("EINVAL: io_submit: reserve field set\n");
return -EINVAL;
}
@@ -1559,6 +1649,7 @@ int fastcall io_submit_one(struct kioctx
if (unlikely(
(iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
(iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+ (iocb->aio_sigeventp != (unsigned long)iocb->aio_sigeventp) ||
((ssize_t)iocb->aio_nbytes < 0)
)) {
pr_debug("EINVAL: io_submit: overflow check\n");
@@ -1593,10 +1684,21 @@ int fastcall io_submit_one(struct kioctx
INIT_LIST_HEAD(&req->ki_wait.task_list);
req->ki_retried = 0;
+ /* handle setting up the sigevent for POSIX AIO signals */
+ req->ki_notify.notify = SIGEV_NONE;
+
+ if (iocb->aio_sigeventp) {
+ ret = aio_setup_sigevent(&req->ki_notify,
+ (struct sigevent __user *)(unsigned long)
+ iocb->aio_sigeventp);
+ if (ret)
+ goto out_put_req;
+ }
+
ret = aio_setup_iocb(req);
if (ret)
- goto out_put_req;
+ goto out_sigqfree;
spin_lock_irq(&ctx->ctx_lock);
aio_run_iocb(req);
@@ -1609,6 +1711,11 @@ int fastcall io_submit_one(struct kioctx
aio_put_req(req); /* drop extra ref to req */
return 0;
+out_sigqfree:
+ /* Undo the sigqueue alloc if something went bad */
+ if (req->ki_notify.sigq)
+ sigqueue_free(req->ki_notify.sigq);
+
out_put_req:
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
Index: linux-2.6.19-rc6-mm2/include/linux/aio_abi.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/aio_abi.h 2006-11-30 10:54:16.000000000 +0100
+++ linux-2.6.19-rc6-mm2/include/linux/aio_abi.h 2006-11-30 15:06:05.000000000 +0100
@@ -82,8 +82,9 @@ struct iocb {
__u64 aio_nbytes;
__s64 aio_offset;
+ __u64 aio_sigeventp; /* pointer to struct sigevent */
+
/* extra parameters */
- __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
__u64 aio_reserved3;
}; /* 64 bytes */
Index: linux-2.6.19-rc6-mm2/include/linux/aio.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/aio.h 2006-11-30 13:18:15.000000000 +0100
+++ linux-2.6.19-rc6-mm2/include/linux/aio.h 2006-11-30 15:06:05.000000000 +0100
@@ -7,6 +7,7 @@
#include <linux/uio.h>
#include <asm/atomic.h>
+#include <asm/siginfo.h>
#define AIO_MAXSEGS 4
#define AIO_KIOGRP_NR_ATOMIC 8
@@ -49,6 +50,14 @@ struct kioctx;
#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags)
#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+struct aio_notify {
+ struct task_struct *target;
+ __u16 signo;
+ __u16 notify;
+ sigval_t value;
+ struct sigqueue *sigq;
+};
+
/* is there a better place to document function pointer methods? */
/**
* ki_retry - iocb forward progress callback
@@ -118,6 +127,9 @@ struct kiocb {
struct list_head ki_list; /* the aio core uses this
* for cancellation */
+
+ /* to notify a process on I/O event */
+ struct aio_notify ki_notify;
};
#define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.19-rc6-mm2/kernel/signal.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/kernel/signal.c 2006-11-30 13:33:10.000000000 +0100
+++ linux-2.6.19-rc6-mm2/kernel/signal.c 2006-11-30 15:06:05.000000000 +0100
@@ -297,7 +297,7 @@ static struct sigqueue *__sigqueue_alloc
static void __sigqueue_free(struct sigqueue *q)
{
- if (q->flags & SIGQUEUE_PREALLOC)
+ if (q->flags & SIGQUEUE_PREALLOC && q->info.si_code != SI_ASYNCIO)
return;
atomic_dec(&q->user->sigpending);
free_uid(q->user);
Index: linux-2.6.19-rc6-mm2/fs/compat.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/compat.c 2006-11-30 13:12:32.000000000 +0100
+++ linux-2.6.19-rc6-mm2/fs/compat.c 2006-11-30 15:06:05.000000000 +0100
@@ -663,6 +663,7 @@ compat_sys_io_submit(aio_context_t ctx_i
compat_uptr_t uptr;
struct iocb __user *user_iocb;
struct iocb tmp;
+ struct compat_sigevent __user *uevent;
if (unlikely(get_user(uptr, iocb + i))) {
ret = -EFAULT;
@@ -676,6 +677,23 @@ compat_sys_io_submit(aio_context_t ctx_i
break;
}
+ uevent = (struct compat_sigevent __user *)tmp.aio_sigeventp;
+
+ if (uevent) {
+ struct sigevent __user *event = NULL;
+ struct sigevent kevent;
+
+ event = compat_alloc_user_space(sizeof(*event));
+
+ if (get_compat_sigevent(&kevent, uevent) ||
+ copy_to_user(event, &kevent, sizeof(*event))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ tmp.aio_sigeventp = (__u64)event;
+ }
+
ret = io_submit_one(ctx, user_iocb, &tmp);
if (ret)
break;
compat_sys_io_submit() cleanup
Clean up compat_sys_io_submit() by duplicating some of the native syscall
logic in the compat layer and calling io_submit_one() directly, instead of
fooling the native syscall into thinking it is called from a 64-bit
caller.
This eliminates:
- the overhead of copying the nr iocb pointers on the userspace stack
- the PAGE_SIZE/(sizeof(void *)) limit on the number of iocbs that
can be submitted.
This is also needed for the completion notification patch to avoid having
to rewrite each iocb on the caller stack for io_submit_one() to find the
sigevents.
compat.c | 61 ++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 34 insertions(+), 27 deletions(-)
Signed-off-by: Sébastien Dugué <[email protected]>
Index: linux-2.6.19-rc6-mm2/fs/compat.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/fs/compat.c 2006-11-30 10:00:18.000000000 +0100
+++ linux-2.6.19-rc6-mm2/fs/compat.c 2006-11-30 13:12:32.000000000 +0100
@@ -642,40 +642,47 @@ out:
return ret;
}
-static inline long
-copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
-{
- compat_uptr_t uptr;
- int i;
-
- for (i = 0; i < nr; ++i) {
- if (get_user(uptr, ptr32 + i))
- return -EFAULT;
- if (put_user(compat_ptr(uptr), ptr64 + i))
- return -EFAULT;
- }
- return 0;
-}
-
-#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
-
asmlinkage long
compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
{
- struct iocb __user * __user *iocb64;
- long ret;
+ struct kioctx *ctx;
+ long ret = 0;
+ int i;
if (unlikely(nr < 0))
return -EINVAL;
- if (nr > MAX_AIO_SUBMITS)
- nr = MAX_AIO_SUBMITS;
-
- iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
- ret = copy_iocb(nr, iocb, iocb64);
- if (!ret)
- ret = sys_io_submit(ctx_id, nr, iocb64);
- return ret;
+ if (unlikely(!access_ok(VERIFY_READ, iocb, (nr * sizeof(u32)))))
+ return -EFAULT;
+
+ ctx = lookup_ioctx(ctx_id);
+ if (unlikely(!ctx))
+ return -EINVAL;
+
+ for (i=0; i<nr; i++) {
+ compat_uptr_t uptr;
+ struct iocb __user *user_iocb;
+ struct iocb tmp;
+
+ if (unlikely(get_user(uptr, iocb + i))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ user_iocb = compat_ptr(uptr);
+
+ if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = io_submit_one(ctx, user_iocb, &tmp);
+ if (ret)
+ break;
+ }
+
+ put_ioctx(ctx);
+ return i ? i: ret;
}
struct compat_ncp_mount_data {
Make good_sigevent() non-static
Move good_sigevent() from posix-timers.c to signal.c where it belongs,
and make it non-static so that it can be used by other subsystems.
include/linux/signal.h | 1 +
kernel/posix-timers.c | 17 -----------------
kernel/signal.c | 24 ++++++++++++++++++++++++
3 files changed, 25 insertions(+), 17 deletions(-)
Signed-off-by: Sébastien Dugué <[email protected]>
Index: linux-2.6.19-rc6-mm2/include/linux/signal.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/linux/signal.h 2006-11-30 13:18:33.000000000 +0100
+++ linux-2.6.19-rc6-mm2/include/linux/signal.h 2006-11-30 13:20:13.000000000 +0100
@@ -240,6 +240,7 @@ extern int sigprocmask(int, sigset_t *,
struct pt_regs;
extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
+extern struct task_struct * good_sigevent(sigevent_t *);
extern struct kmem_cache *sighand_cachep;
Index: linux-2.6.19-rc6-mm2/kernel/posix-timers.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/kernel/posix-timers.c 2006-11-30 13:18:33.000000000 +0100
+++ linux-2.6.19-rc6-mm2/kernel/posix-timers.c 2006-11-30 13:20:13.000000000 +0100
@@ -367,23 +367,6 @@ static enum hrtimer_restart posix_timer_
return ret;
}
-static struct task_struct * good_sigevent(sigevent_t * event)
-{
- struct task_struct *rtn = current->group_leader;
-
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
- rtn->tgid != current->tgid ||
- (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
- return NULL;
-
- if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
- return NULL;
-
- return rtn;
-}
-
void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
{
if ((unsigned) clock_id >= MAX_CLOCKS) {
Index: linux-2.6.19-rc6-mm2/kernel/signal.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/kernel/signal.c 2006-11-30 13:18:33.000000000 +0100
+++ linux-2.6.19-rc6-mm2/kernel/signal.c 2006-11-30 13:33:10.000000000 +0100
@@ -1189,6 +1189,30 @@ int group_send_sig_info(int sig, struct
return ret;
}
+/***
+ * good_sigevent - check and get target task from a sigevent.
+ * @event: the sigevent to be checked
+ *
+ * This function must be called with the tasklist_lock held for reading.
+ */
+struct task_struct * good_sigevent(sigevent_t * event)
+{
+ struct task_struct *task = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) == SIGEV_THREAD_ID) {
+ task = find_task_by_pid(event->sigev_notify_thread_id);
+
+ if (!task || task->tgid != current->tgid)
+ return NULL;
+ } else if (event->sigev_notify == SIGEV_SIGNAL) {
+ if ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))
+ return NULL;
+ } else
+ return NULL;
+
+ return task;
+}
+
/*
* kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)