2012-10-09 06:39:36

by Kent Overstreet

Subject: [PATCH 1/5] aio: Kill return value of aio_complete()

Nothing used the return value, and it probably wasn't possible to use it
safely for the locked versions (aio_complete(), aio_put_req()). Just
kill it.

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/aio.c | 19 +++++++------------
include/linux/aio.h | 8 ++++----
2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..1ad2d97 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -548,7 +548,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
/* __aio_put_req
* Returns true if this put was the last user of the request.
*/
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+static void __aio_put_req(struct kioctx *ctx, struct kiocb *req)
{
dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
req, atomic_long_read(&req->ki_filp->f_count));
@@ -558,7 +558,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
req->ki_users--;
BUG_ON(req->ki_users < 0);
if (likely(req->ki_users))
- return 0;
+ return;
list_del(&req->ki_list); /* remove from active_reqs */
req->ki_cancel = NULL;
req->ki_retry = NULL;
@@ -566,21 +566,18 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
fput(req->ki_filp);
req->ki_filp = NULL;
really_put_req(ctx, req);
- return 1;
}

/* aio_put_req
* Returns true if this put was the last user of the kiocb,
* false if the request is still in use.
*/
-int aio_put_req(struct kiocb *req)
+void aio_put_req(struct kiocb *req)
{
struct kioctx *ctx = req->ki_ctx;
- int ret;
spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
+ __aio_put_req(ctx, req);
spin_unlock_irq(&ctx->ctx_lock);
- return ret;
}
EXPORT_SYMBOL(aio_put_req);

@@ -889,7 +886,7 @@ EXPORT_SYMBOL(kick_iocb);
* Returns true if this is the last user of the request. The
* only other user of the request can be the cancellation code.
*/
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring_info *info;
@@ -897,7 +894,6 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
struct io_event *event;
unsigned long flags;
unsigned long tail;
- int ret;

/*
* Special case handling for sync iocbs:
@@ -911,7 +907,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
iocb->ki_user_data = res;
iocb->ki_users = 0;
wake_up_process(iocb->ki_obj.tsk);
- return 1;
+ return;
}

info = &ctx->ring_info;
@@ -973,7 +969,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)

put_rq:
/* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+ __aio_put_req(ctx, iocb);

/*
* We have to order our ring_info tail store above and test
@@ -987,7 +983,6 @@ put_rq:
wake_up(&ctx->wait);

spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
}
EXPORT_SYMBOL(aio_complete);

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 31ff6db..4cde86d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -211,18 +211,18 @@ extern unsigned aio_max_size;

#ifdef CONFIG_AIO
extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
-extern int aio_put_req(struct kiocb *iocb);
+extern void aio_put_req(struct kiocb *iocb);
extern void kick_iocb(struct kiocb *iocb);
-extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_complete(struct kiocb *iocb, long res, long res2);
struct mm_struct;
extern void exit_aio(struct mm_struct *mm);
extern long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user *__user *iocbpp, bool compat);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
-static inline int aio_put_req(struct kiocb *iocb) { return 0; }
+static inline void aio_put_req(struct kiocb *iocb) { }
static inline void kick_iocb(struct kiocb *iocb) { }
-static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
struct mm_struct;
static inline void exit_aio(struct mm_struct *mm) { }
static inline long do_io_submit(aio_context_t ctx_id, long nr,
--
1.7.10.4


2012-10-09 06:39:39

by Kent Overstreet

Subject: [PATCH 4/5] aio: vmap ringbuffer

It simplifies a lot of stuff if the ringbuffer is contiguously mapped
into kernel space, and we can delete a lot of code - in particular, this
is useful for converting read_events() to cmpxchg.

It'd make more sense if the ringbuffer was allocated with
__get_free_pages() and then mapped into userspace, but I couldn't figure
out how to sanely do that... so vmap works for now.

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/aio.c | 85 +++++++++++++++++----------------------------------
include/linux/aio.h | 7 +++--
2 files changed, 32 insertions(+), 60 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 3ab12f6..c3d97d1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -82,6 +82,9 @@ static void aio_free_ring(struct kioctx *ctx)
struct aio_ring_info *info = &ctx->ring_info;
long i;

+ if (info->ring)
+ vunmap(info->ring);
+
for (i=0; i<info->nr_pages; i++)
put_page(info->ring_pages[i]);

@@ -99,7 +102,6 @@ static void aio_free_ring(struct kioctx *ctx)

static int aio_setup_ring(struct kioctx *ctx)
{
- struct aio_ring *ring;
struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
unsigned long size;
@@ -149,46 +151,27 @@ static int aio_setup_ring(struct kioctx *ctx)
return -EAGAIN;
}

+ info->ring = vmap(info->ring_pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!info->ring) {
+ aio_free_ring(ctx);
+ return -ENOMEM;
+ }
+
ctx->user_id = info->mmap_base;

info->nr = nr_events; /* trusted copy */

- ring = kmap_atomic(info->ring_pages[0]);
- ring->nr = nr_events; /* user copy */
- ring->id = ctx->user_id;
- ring->head = ring->tail = 0;
- ring->magic = AIO_RING_MAGIC;
- ring->compat_features = AIO_RING_COMPAT_FEATURES;
- ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
- ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring);
+ info->ring->nr = nr_events; /* user copy */
+ info->ring->id = ctx->user_id;
+ info->ring->head = info->ring->tail = 0;
+ info->ring->magic = AIO_RING_MAGIC;
+ info->ring->compat_features = AIO_RING_COMPAT_FEATURES;
+ info->ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
+ info->ring->header_length = sizeof(struct aio_ring);

return 0;
}

-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(). Release the pointer with put_aio_ring_event();
- */
-#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
-#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
-#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-
-#define aio_ring_event(info, nr) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
-
-#define put_aio_ring_event(event) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
static void free_ioctx(struct work_struct *work)
{
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -465,7 +448,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
unsigned short allocated, to_alloc;
long avail;
struct kiocb *req, *n;
- struct aio_ring *ring;

to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
for (allocated = 0; allocated < to_alloc; allocated++) {
@@ -480,9 +462,8 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
goto out;

spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0]);

- avail = aio_ring_avail(&ctx->ring_info, ring) - atomic_read(&ctx->reqs_active);
+ avail = aio_ring_avail(&ctx->ring_info) - atomic_read(&ctx->reqs_active);
BUG_ON(avail < 0);
if (avail < allocated) {
/* Trim back the number of requests. */
@@ -500,7 +481,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
atomic_inc(&ctx->reqs_active);
}

- kunmap_atomic(ring);
spin_unlock_irq(&ctx->ctx_lock);

out:
@@ -870,10 +850,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring_info *info;
- struct aio_ring *ring;
struct io_event *event;
unsigned long flags;
- unsigned long tail;
+ unsigned tail;

/*
* Special case handling for sync iocbs:
@@ -892,7 +871,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)

info = &ctx->ring_info;

- /* add a completion event to the ring buffer.
+ /*
+ * add a completion event to the ring buffer.
* must be done holding ctx->ctx_lock to prevent
* other code from messing with the tail
* pointer since we might be called from irq
@@ -910,10 +890,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
if (kiocbIsCancelled(iocb))
goto put_rq;

- ring = kmap_atomic(info->ring_pages[0]);
-
tail = info->tail;
- event = aio_ring_event(info, tail);
+ event = &info->ring->io_events[tail];
if (++tail >= info->nr)
tail = 0;

@@ -922,9 +900,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
event->res = res;
event->res2 = res2;

- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ res, res2);

/* after flagging the request as done, we
* must never even look at it again
@@ -932,12 +910,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
smp_wmb(); /* make event visible before updating tail */

info->tail = tail;
- ring->tail = tail;
-
- put_aio_ring_event(event);
- kunmap_atomic(ring);
+ info->ring->tail = tail;

- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);

/*
* Check if the user asked us to deliver the result through an
@@ -975,11 +950,10 @@ EXPORT_SYMBOL(aio_complete);
static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
{
struct aio_ring_info *info = &ioctx->ring_info;
- struct aio_ring *ring;
+ struct aio_ring *ring = info->ring;
unsigned long head;
int ret = 0;

- ring = kmap_atomic(info->ring_pages[0]);
dprintk("in aio_read_evt h%lu t%lu m%lu\n",
(unsigned long)ring->head, (unsigned long)ring->tail,
(unsigned long)ring->nr);
@@ -991,18 +965,15 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)

head = ring->head % info->nr;
if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
+ *ent = ring->io_events[head];
head = (head + 1) % info->nr;
smp_mb(); /* finish reading the event before updatng the head */
ring->head = head;
ret = 1;
- put_aio_ring_event(evp);
}
spin_unlock(&info->ring_lock);

out:
- kunmap_atomic(ring);
dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
(unsigned long)ring->head, (unsigned long)ring->tail);
return ret;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index eb6e5e4..150a4b7 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -161,6 +161,7 @@ struct aio_ring {

#define AIO_RING_PAGES 8
struct aio_ring_info {
+ struct aio_ring *ring;
unsigned long mmap_base;
unsigned long mmap_size;

@@ -173,10 +174,10 @@ struct aio_ring_info {
struct page *internal_pages[AIO_RING_PAGES];
};

-static inline unsigned aio_ring_avail(struct aio_ring_info *info,
- struct aio_ring *ring)
+static inline unsigned aio_ring_avail(struct aio_ring_info *info)
{
- return (ring->head + info->nr - 1 - ring->tail) % info->nr;
+ return (info->ring->head + info->nr - 1 - info->ring->tail) %
+ info->nr;
}

struct kioctx {
--
1.7.10.4

2012-10-09 06:39:55

by Kent Overstreet

Subject: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

Bunch of cleanup, and make it lockless so that userspace can safely pull
events off the ringbuffer without racing with io_getevents().

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/aio.c | 220 +++++++++++++++++++++-----------------------------------------
1 file changed, 73 insertions(+), 147 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index c3d97d1..1efc8a3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -259,7 +259,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
atomic_set(&ctx->reqs_active, 1);

spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);

INIT_LIST_HEAD(&ctx->active_reqs);
@@ -941,194 +940,121 @@ put_rq:
}
EXPORT_SYMBOL(aio_complete);

-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static int aio_read_events(struct kioctx *ctx, struct io_event __user *event,
+ long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
+ struct aio_ring_info *info = &ctx->ring_info;
struct aio_ring *ring = info->ring;
- unsigned long head;
- int ret = 0;
-
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
-
- spin_lock(&info->ring_lock);
+ unsigned old, new;
+ int ret;

- head = ring->head % info->nr;
- if (head != ring->tail) {
- *ent = ring->io_events[head];
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- }
- spin_unlock(&info->ring_lock);
+ pr_debug("h%lu t%lu m%lu\n",
+ (unsigned long) ring->head,
+ (unsigned long) ring->tail,
+ (unsigned long) ring->nr);
+retry:
+ ret = 0;
+ old = new = ring->head;

-out:
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
- return ret;
-}
+ while (ret < nr && new != ring->tail) {
+ struct io_event *ev = &ring->io_events[new];
+ unsigned i = (new < ring->tail ? ring->tail : info->nr) - new;

-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
+ i = min_t(int, i, nr - ret);

-static void timeout_func(unsigned long data)
-{
- struct aio_timeout *to = (struct aio_timeout *)data;
+ if (unlikely(copy_to_user(event + ret, ev, sizeof(*ev) * i)))
+ return -EFAULT;

- to->timed_out = 1;
- wake_up_process(to->p);
-}
+ ret += i;
+ new += i;
+ new %= info->nr;
+ }

-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
+ if (new != old) {
+ smp_mb(); /* finish reading the event before updatng the head */

-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
+ if (cmpxchg(&ring->head, old, new) != old)
+ goto retry;
+ }

-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
+ pr_debug("ret %d h%lu t%lu\n", ret,
+ (unsigned long)ring->head,
+ (unsigned long)ring->tail);
+ return ret;
}

-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
+static int read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
-
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
+ DEFINE_WAIT(wait);
+ long until = MAX_SCHEDULE_TIMEOUT;
+ size_t i = 0;
+ int ret, retry = 0;
retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
-
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
-
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
- ret = 0;
-
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ ret = aio_read_events(ctx, event + i, nr - i);
+ if (ret < 0)
+ return ret;

- if (min_nr <= i)
+ i += ret;
+ if (i >= min_nr)
return i;
- if (ret)
- return ret;

/* End fast path */

/* racey check, but it gets redone */
+ /* XXX: wtf is this for? */
if (!retry && unlikely(!list_empty(&ctx->run_list))) {
retry = 1;
aio_run_all_iocbs(ctx);
goto retry;
}

- init_timeout(&to);
if (timeout) {
struct timespec ts;
- ret = -EFAULT;
if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
+ return -EFAULT;

- set_timeout(start_jiffies, &to, &ts);
+ until = timespec_to_jiffies(&ts);
}

- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (atomic_read(&ctx->reqs_active) > 1)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
+ while (i < nr) {
+ prepare_to_wait_exclusive(&ctx->wait, &wait,
+ TASK_INTERRUPTIBLE);

- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ ret = aio_read_events(ctx, event + i, nr - i);
+ if (ret < 0)
+ break;

- if (unlikely(ret <= 0))
+ i += ret;
+ if (i >= min_nr)
break;

- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
+ if (unlikely(ctx->dead)) {
+ ret = -EINVAL;
break;
}

- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
+ if (!until) /* Only check after read evt */
+ break;
+
+ /* Try to only show up in io wait if there are ops in flight */
+ if (atomic_read(&ctx->reqs_active) > 1)
+ until = io_schedule_timeout(until);
+ else
+ until = schedule_timeout(until);
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
}

- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
+ finish_wait(&ctx->wait, &wait);
return i ? i : ret;
}

--
1.7.10.4

2012-10-09 06:40:12

by Kent Overstreet

Subject: [PATCH 3/5] aio: Rewrite refcounting

The refcounting before wasn't very clear; there are two refcounts in
struct kioctx, with an unclear relationship between them (or between
them and ctx->dead).

Now, reqs_active holds a refcount on users (when reqs_active is
nonzero), and the initial refcount is taken on reqs_active - when
ctx->dead goes to 1, we drop that initial refcount.

Some other semi-related cleanup too.

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/aio.c | 187 +++++++++++++++++----------------------------------
include/linux/aio.h | 5 +-
2 files changed, 63 insertions(+), 129 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 95419c4..3ab12f6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -86,8 +86,9 @@ static void aio_free_ring(struct kioctx *ctx)
put_page(info->ring_pages[i]);

if (info->mmap_size) {
- BUG_ON(ctx->mm != current->mm);
- vm_munmap(info->mmap_base, info->mmap_size);
+ down_write(&ctx->mm->mmap_sem);
+ do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+ up_write(&ctx->mm->mmap_sem);
}

if (info->ring_pages && info->ring_pages != info->internal_pages)
@@ -188,45 +189,37 @@ static int aio_setup_ring(struct kioctx *ctx)
kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
} while(0)

-static void ctx_rcu_free(struct rcu_head *head)
-{
- struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- kmem_cache_free(kioctx_cachep, ctx);
-}
-
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
- */
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct work_struct *work)
{
- unsigned nr_events = ctx->max_reqs;
- BUG_ON(ctx->reqs_active);
+ struct kioctx *ctx = container_of(work, struct kioctx, free_work);

cancel_delayed_work_sync(&ctx->wq);
aio_free_ring(ctx);
mmdrop(ctx->mm);
- ctx->mm = NULL;
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
- }
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}

-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
- return atomic_inc_not_zero(&kioctx->users);
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ synchronize_rcu();
+
+ pr_debug("__put_ioctx: freeing %p\n", ctx);
+ kmem_cache_free(kioctx_cachep, ctx);
}

static inline void put_ioctx(struct kioctx *kioctx)
{
BUG_ON(atomic_read(&kioctx->users) <= 0);
if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ schedule_work(&kioctx->free_work);
+}
+
+static inline void req_put_ioctx(struct kioctx *kioctx)
+{
+ BUG_ON(atomic_read(&kioctx->reqs_active) <= 0);
+ if (unlikely(atomic_dec_and_test(&kioctx->reqs_active)))
+ put_ioctx(kioctx);
}

static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
@@ -280,12 +273,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
atomic_inc(&mm->mm_count);

atomic_set(&ctx->users, 2);
+ atomic_set(&ctx->reqs_active, 1);
+
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);

INIT_LIST_HEAD(&ctx->active_reqs);
INIT_LIST_HEAD(&ctx->run_list);
+ INIT_WORK(&ctx->free_work, free_ioctx);
INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);

if (aio_setup_ring(ctx) < 0)
@@ -327,36 +323,25 @@ out_freectx:
*/
static void kill_ctx(struct kioctx *ctx)
{
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
+ struct kiocb *iocb, *t;
struct io_event res;
+ int put = 0;

spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);

- kiocb_cancel(ctx, iocb, &res);
+ if (!ctx->dead) {
+ put = 1;
+ ctx->dead = 1;
+ hlist_del_rcu(&ctx->list);
}

- if (!ctx->reqs_active)
- goto out;
-
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
- }
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ list_for_each_entry_safe(iocb, t, &ctx->active_reqs, ki_list)
+ kiocb_cancel(ctx, iocb, &res);

-out:
spin_unlock_irq(&ctx->ctx_lock);
+
+ if (put)
+ req_put_ioctx(ctx);
}

/* wait_on_sync_kiocb:
@@ -385,18 +370,16 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
void exit_aio(struct mm_struct *mm)
{
struct kioctx *ctx;
+ struct hlist_node *p;

- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
-
- kill_ctx(ctx);
+ spin_lock(&mm->ioctx_lock);

+ hlist_for_each_entry(ctx, p, &mm->ioctx_list, list) {
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
"exit_aio:ioctx still alive: %d %d %d\n",
atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
+ atomic_read(&ctx->reqs_active));
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
@@ -408,39 +391,34 @@ void exit_aio(struct mm_struct *mm)
* all other callers have ctx->mm == current->mm.
*/
ctx->ring_info.mmap_size = 0;
- put_ioctx(ctx);
+ kill_ctx(ctx);
}
+
+ spin_unlock(&mm->ioctx_lock);
}

/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
+ * Allocate a slot for an aio request. Increments the ki_users count
* of the kioctx so that the kioctx stays around until all requests are
* complete. Returns NULL if no requests are free.
*
- * Returns with kiocb->users set to 2. The io submit code path holds
+ * Returns with kiocb->ki_users set to 2. The io submit code path holds
* an extra reference while submitting the i/o.
* This prevents races between the aio code path referencing the
* req (after submitting it) and aio_complete() freeing the req.
*/
static struct kiocb *__aio_get_req(struct kioctx *ctx)
{
- struct kiocb *req = NULL;
+ struct kiocb *req;

req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
if (unlikely(!req))
return NULL;

- req->ki_flags = 0;
+ memset(req, 0, sizeof(*req));
req->ki_users = 2;
- req->ki_key = 0;
req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;

return req;
}
@@ -473,10 +451,8 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
list_del(&req->ki_batch);
list_del(&req->ki_list);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
+ req_put_ioctx(ctx);
}
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
spin_unlock_irq(&ctx->ctx_lock);
}

@@ -506,7 +482,7 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
spin_lock_irq(&ctx->ctx_lock);
ring = kmap_atomic(ctx->ring_info.ring_pages[0]);

- avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
+ avail = aio_ring_avail(&ctx->ring_info, ring) - atomic_read(&ctx->reqs_active);
BUG_ON(avail < 0);
if (avail < allocated) {
/* Trim back the number of requests. */
@@ -521,7 +497,7 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
batch->count -= allocated;
list_for_each_entry(req, &batch->head, ki_batch) {
list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
+ atomic_inc(&ctx->reqs_active);
}

kunmap_atomic(ring);
@@ -546,8 +522,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx,

static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
{
- assert_spin_locked(&ctx->ctx_lock);
-
+ fput(req->ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
if (req->ki_dtor)
@@ -555,10 +530,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;

- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
+ req_put_ioctx(ctx);
}

/* __aio_put_req
@@ -566,8 +539,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
*/
static void __aio_put_req(struct kioctx *ctx, struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
+ pr_debug("req=%p f_count=%ld\n",
+ req, atomic_long_read(&req->ki_filp->f_count));

assert_spin_locked(&ctx->ctx_lock);

@@ -576,11 +549,7 @@ static void __aio_put_req(struct kioctx *ctx, struct kiocb *req)
if (likely(req->ki_users))
return;
list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;

- fput(req->ki_filp);
- req->ki_filp = NULL;
really_put_req(ctx, req);
}

@@ -605,18 +574,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)

rcu_read_lock();

- hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+ hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list)
+ if (ctx->user_id == ctx_id){
+ BUG_ON(ctx->dead);
+ atomic_inc(&ctx->users);
ret = ctx;
break;
}
- }

rcu_read_unlock();
return ret;
@@ -1162,7 +1126,7 @@ retry:
break;
/* Try to only show up in io wait if there are ops
* in flight */
- if (ctx->reqs_active)
+ if (atomic_read(&ctx->reqs_active) > 1)
io_schedule();
else
schedule();
@@ -1197,35 +1161,6 @@ out:
return i ? i : ret;
}

-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
-{
- struct mm_struct *mm = current->mm;
- int was_dead;
-
- /* delete the entry from the list is someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
-
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
-
- kill_ctx(ioctx);
-
- /*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
- */
- wake_up_all(&ioctx->wait);
-}
-
/* sys_io_setup:
* Create an aio_context capable of receiving at least nr_events.
* ctxp must not point to an aio_context that already exists, and
@@ -1261,7 +1196,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- io_destroy(ioctx);
+ kill_ctx(ioctx);
put_ioctx(ioctx);
}

@@ -1279,7 +1214,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
+ kill_ctx(ioctx);
put_ioctx(ioctx);
return 0;
}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 4cde86d..eb6e5e4 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -192,7 +192,7 @@ struct kioctx {

spinlock_t ctx_lock;

- int reqs_active;
+ atomic_t reqs_active;
struct list_head active_reqs; /* used for cancellation */
struct list_head run_list; /* used for kicked reqs */

@@ -202,8 +202,7 @@ struct kioctx {
struct aio_ring_info ring_info;

struct delayed_work wq;
-
- struct rcu_head rcu_head;
+ struct work_struct free_work;
};

/* prototypes */
--
1.7.10.4

2012-10-09 06:40:50

by Kent Overstreet

Subject: [PATCH 2/5] aio: kiocb_cancel()

Minor refactoring, to get rid of some duplicated code

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/aio.c | 72 ++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 39 insertions(+), 33 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1ad2d97..95419c4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -229,6 +229,29 @@ static inline void put_ioctx(struct kioctx *kioctx)
__put_ioctx(kioctx);
}

+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+ struct io_event *res)
+{
+ int (*cancel)(struct kiocb *, struct io_event *);
+ int ret = -EINVAL;
+
+ cancel = kiocb->ki_cancel;
+ kiocbSetCancelled(kiocb);
+ if (cancel) {
+ kiocb->ki_users++;
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ memset(res, 0, sizeof(*res));
+ res->obj = (u64) kiocb->ki_obj.user;
+ res->data = kiocb->ki_user_data;
+ ret = cancel(kiocb, res);
+
+ spin_lock_irq(&ctx->ctx_lock);
+ }
+
+ return ret;
+}
+
/* ioctx_alloc
* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
*/
@@ -304,7 +327,6 @@ out_freectx:
*/
static void kill_ctx(struct kioctx *ctx)
{
- int (*cancel)(struct kiocb *, struct io_event *);
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
struct io_event res;
@@ -315,14 +337,8 @@ static void kill_ctx(struct kioctx *ctx)
struct list_head *pos = ctx->active_reqs.next;
struct kiocb *iocb = list_kiocb(pos);
list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
+
+ kiocb_cancel(ctx, iocb, &res);
}

if (!ctx->reqs_active)
@@ -1709,7 +1725,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct io_event res;
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1724,32 +1740,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;

spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(ctx, kiocb, &res);
+ else
+ ret = -EAGAIN;
+
spin_unlock_irq(&ctx->ctx_lock);

- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(result, &res, sizeof(res)))
+ ret = -EFAULT;
+ }

put_ioctx(ctx);

--
1.7.10.4

2012-10-09 18:25:43

by Zach Brown

Subject: Re: [PATCH 1/5] aio: Kill return value of aio_complete()

On Mon, Oct 08, 2012 at 11:39:16PM -0700, Kent Overstreet wrote:
> Nothing used the return value, and it probably wasn't possible to use it
> safely for the locked versions (aio_complete(), aio_put_req()). Just
> kill it.

Nice, seems reasonable enough. (Queue the timer for complaints from out
of tree code?)

Acked-by: Zach Brown <[email protected]>

Though, in the future please cc: aio patches to the maintainers. I'd
have missed this if I wasn't sifting through lkml:

$ ./scripts/get_maintainer.pl -f fs/aio.c
Benjamin LaHaise <[email protected]> (supporter:AIO)
Alexander Viro <[email protected]> (maintainer:FILESYSTEMS (VFS...)
[email protected] (open list:AIO)
[email protected] (open list:FILESYSTEMS (VFS...)
[email protected] (open list)

- z

2012-10-09 18:26:37

by Zach Brown

Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

On Mon, Oct 08, 2012 at 11:39:17PM -0700, Kent Overstreet wrote:
> Minor refactoring, to get rid of some duplicated code

Honestly: I wouldn't bother. Nothing of consequence uses cancel.

I have an RFC patch series that tears it out. Let me polish that up and
send it out, I'll cc: you.

- z

2012-10-09 18:28:07

by Zach Brown

Subject: Re: [PATCH 3/5] aio: Rewrite refcounting

On Mon, Oct 08, 2012 at 11:39:18PM -0700, Kent Overstreet wrote:
> The refcounting before wasn't very clear; there are two refcounts in
> struct kioctx, with an unclear relationship between them (or between
> them and ctx->dead).
>
> Now, reqs_active holds a refcount on users (when reqs_active is
> nonzero), and the initial refcount is taken on reqs_active - when
> ctx->dead goes to 1, we drop that initial refcount.

I agree that it's a mess, but let's rethink this work on top of the
series I'm sending out that gets rid of the retry and cancel code. It
makes the code a lot easier to follow. (And Jens also has some patches
to take fewer locks in the submission path, we'll want to take them into
account too.)

- z

2012-10-09 18:30:00

by Zach Brown

Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

On Mon, Oct 08, 2012 at 11:39:19PM -0700, Kent Overstreet wrote:
> It simplifies a lot of stuff if the ringbuffer is contiguously mapped
> into kernel space, and we can delete a lot of code - in particular, this
> is useful for converting read_events() to cmpxchg.

1) I'm concerned that Our Favourite Database Benchmarkers will see an
effect of having more TLB pressure from that mapping. kmap_atomic()
should be cheaper. We'll need to measure the difference.

2) I'm not at all convinced that the current ring semantics are safe
with cmpxchg(), more in the next patch.

- z

2012-10-09 18:38:04

by Zach Brown

Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

On Mon, Oct 08, 2012 at 11:39:20PM -0700, Kent Overstreet wrote:
> Bunch of cleanup

Ugh. That's way too much noisy change for one patch with no
description. Break it up into functional pieces and actually describe
them.

> events off the ringbuffer without racing with io_getevents().

Are you sure this is safe in the presence of wrapping indices? It's
been a very long time since I've looked at this, but I could never
convince myself that it was safe.

What I'm worried about is cmpxchg()'s caller sampling, say, an index of
0, having another task GO NUTS and wrap that index all the way around
back to 0, and then having that initial cmpxchg() caller see the wrapped
0 index and think that nothing's changed in the interim.

Is this a problem?

(I wish I could find the comment I wrote in a very old patch to tear out
the mapped ring entirely.. It was a great list of its design mistakes.)

- z

2012-10-09 21:27:32

by Kent Overstreet

Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

On Tue, Oct 09, 2012 at 11:37:53AM -0700, Zach Brown wrote:
> On Mon, Oct 08, 2012 at 11:39:20PM -0700, Kent Overstreet wrote:
> > Bunch of cleanup
>
> Ugh. That's way too much noisy change for one patch with no
> description. Break it up into functional pieces and actually describe
> them.

Heh, I actually didn't mean to send these patches out to lkml just yet
(though now I'm glad I did). There definitely is too much in this patch,
it kinda snowballed.

> > events off the ringbuffer without racing with io_getevents().
>
> Are you sure this is safe in the presence of wrapping indices? It's
> been a very long time since I've looked at this, but I could never
> convince myself that it was safe.
>
> What I'm worried about is cmpxchg()'s caller sampling, say, an index of
> 0, having another task GO NUTS and wrap that index all the way around
> back to 0, and then having that initial cmpxchg() caller see the wrapped
> 0 index and think that nothing's changed in the interim.
>
> Is this a problem?

Ohhhh, yeah it is - ABA. That's easily solved though, we do the head %
nr when we're using the head pointer, and let the stored value use the
full 32 bits.
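
Roughly, a minimal sketch of that scheme (the struct and helper names here
are made up for illustration, not the actual fs/aio.c code):

#include <linux/atomic.h>	/* cmpxchg() */
#include <linux/aio_abi.h>	/* struct io_event */

/*
 * Sketch only: head and tail are free-running 32-bit counters, reduced
 * mod nr only when indexing into the event array.  A reader's cmpxchg()
 * then can't be fooled by an index that wrapped around to the same
 * stored value (ABA), short of 2^32 completions in between.
 */
struct ring_sketch {
	unsigned		head;
	unsigned		tail;
	unsigned		nr;	/* number of slots */
	struct io_event		io_events[];
};

static int ring_sketch_pop(struct ring_sketch *ring, struct io_event *ev)
{
	unsigned old, new;

	do {
		old = new = ring->head;
		if (old == ring->tail)
			return 0;			/* ring is empty */

		*ev = ring->io_events[new % ring->nr];	/* mask only here */
		new++;					/* store full 32 bits */
	} while (cmpxchg(&ring->head, old, new) != old);

	return 1;
}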

If libaio is the only thing in userspace looking at the ringbuffer, and
if I'm looking at the latest libaio code this shouldn't break
anything...

> (I wish I could find the comment I wrote in a very old patch to tear out
> the mapped ring entirely.. It was a great list of its design mistakes.)

Sticking the head and tail pointers on the same cacheline is one of my
main complaints. If you could find that comment I'd be interested in
reading it, though.
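
For what it's worth, the kind of layout meant here looks roughly like this
(illustrative names only, not the current struct aio_ring):

#include <linux/cache.h>	/* ____cacheline_aligned_in_smp */

/* Keeping the consumer-updated head and the producer-updated tail on
 * separate cachelines avoids false sharing between completion (producer)
 * and consumption (consumer). */
struct ring_header_sketch {
	unsigned	head ____cacheline_aligned_in_smp;
	unsigned	tail ____cacheline_aligned_in_smp;
	unsigned	nr;
	unsigned	flags;
};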

2012-10-09 21:31:21

by Kent Overstreet

Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

On Tue, Oct 09, 2012 at 11:29:49AM -0700, Zach Brown wrote:
> On Mon, Oct 08, 2012 at 11:39:19PM -0700, Kent Overstreet wrote:
> > It simplifies a lot of stuff if the ringbuffer is contiguously mapped
> > into kernel space, and we can delete a lot of code - in particular, this
> > is useful for converting read_events() to cmpxchg.
>
> 1) I'm concerned that Our Favourite Database Benchmarkers will see an
> effect of having more TLB pressure from that mapping. kmap_atomic()
> should be cheaper. We'll need to measure the difference.

If it is measurable I'll take another stab at using memory from
__get_free_pages() for the ringbuffer. That really would be the ideal
solution.

The other reason I wanted to do this was for the aio attributes stuff -
for return values, I think the only sane way is for the return values to
go in the ringbuffer, which means records are no longer fixed size so
dealing with pages is even more of a pain.

2012-10-09 21:37:05

by Kent Overstreet

Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

On Tue, Oct 09, 2012 at 11:26:25AM -0700, Zach Brown wrote:
> On Mon, Oct 08, 2012 at 11:39:17PM -0700, Kent Overstreet wrote:
> > Minor refactoring, to get rid of some duplicated code
>
> Honestly: I wouldn't bother. Nothing of consequence uses cancel.
>
> I have an RFC patch series that tears it out. Let me polish that up and
> send it out, I'll cc: you.

Even better :)

I've been looking at aio locking the past few days, and I was getting
ready to write something up about cancellation to the lists.

Short version: supporting cancellation without global synchronization is
possible but it'd require help from the allocator.

If we can just rip it out though that'd really make me happy.

2012-10-09 22:21:59

by Kent Overstreet

Subject: Re: [PATCH 3/5] aio: Rewrite refcounting

On Tue, Oct 09, 2012 at 11:27:55AM -0700, Zach Brown wrote:
> On Mon, Oct 08, 2012 at 11:39:18PM -0700, Kent Overstreet wrote:
> > The refcounting before wasn't very clear; there are two refcounts in
> > struct kioctx, with an unclear relationship between them (or between
> > them and ctx->dead).
> >
> > Now, reqs_active holds a refcount on users (when reqs_active is
> > nonzero), and the initial refcount is taken on reqs_active - when
> > ctx->dead goes to 1, we drop that initial refcount.
>
> I agree that it's a mess, but let's rethink this work on top of the
> series I'm sending out that gets rid of the retry and cancel code. It
> makes the code a lot easier to follow. (And Jens also has some patches
> to take fewer locks in the submission path, we'll want to take them into
> account too.)

Alright... send it out then. Also, do you know which branch Jens has his
patches in?

2012-10-09 22:32:23

by Zach Brown

Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

> If it is measurable I'll take another stab at using memory from
> __get_free_pages() for the ringbuffer. That really would be the ideal
> solution.

No, then you'll run into high order allocation failures with rings that
don't fit in a single page.

> The other reason I wanted to do this was for the aio attributes stuff -
> for return values, I think the only sane way is for the return values to
> go in the ringbuffer, which means records are no longer fixed size so
> dealing with pages is even more of a pain.

Then let's see that, please.

And can we please stop calling them attributes? They're inputs and
outputs that change behaviour -- they're interfaces.

And no, just for the record, I don't think generic packed variable size
structs are worth the trouble.

If we're going to do a generic interface extension mechanism then we
should put it in its own well thought out system calls, not staple it on
to the side of aio because it's there. It's a really crummy base to
work from.

- z

2012-10-09 22:35:13

by Zach Brown

Subject: Re: [PATCH 3/5] aio: Rewrite refcounting

> Alright... send it out then.

Workin' on it! :)

> Also, do you know which branch Jens has his patches in?

http://git.kernel.dk/?p=linux-block.git;a=commit;h=6b6723fc3e4f24dbd80526df935ca115ead578c6

https://plus.google.com/111643045511375507360/posts

As far as I know, he hasn't had a chance to work on it recently.

- z

2012-10-09 22:44:33

by Kent Overstreet

Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

On Tue, Oct 09, 2012 at 03:32:10PM -0700, Zach Brown wrote:
> > If it is measurable I'll take another stab at using memory from
> > __get_free_pages() for the ringbuffer. That really would be the ideal
> > solution.
>
> No, then you'll run into high order allocation failures with rings that
> don't fit in a single page.

Not if we decouple the ringbuffer size from max_requests.

This would be useful to do anyways because right now, allocating a kiocb
has to take a global refcount and check head and tail in the ringbuffer
just so it can avoid overflowing the ringbuffer.

If we change aio_complete() so that if the ringbuffer is full then the
kiocb just goes on a linked list - we can size the ringbuffer so this
doesn't happen normally and avoid the global synchronization in the fast
path.
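
A rough sketch of that idea (all of the names below are hypothetical, and
draining the overflow list back into the ring once readers make space is
left out):

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hypothetical stand-ins, just enough to show the shape of the idea -
 * not the real struct kioctx/kiocb. */
struct kiocb_sketch {
	struct list_head	list;
	long			res, res2;
};

struct kioctx_sketch {
	spinlock_t		completion_lock;
	struct list_head	overflow;	/* completions that didn't fit */
	wait_queue_head_t	wait;
	/* ... ring fields ... */
};

/* assumed ring helpers, not shown */
bool ring_sketch_has_space(struct kioctx_sketch *ctx);
void ring_sketch_push(struct kioctx_sketch *ctx, struct kiocb_sketch *req);

static void aio_complete_sketch(struct kioctx_sketch *ctx,
				struct kiocb_sketch *req, long res, long res2)
{
	unsigned long flags;

	req->res  = res;
	req->res2 = res2;

	spin_lock_irqsave(&ctx->completion_lock, flags);

	if (ring_sketch_has_space(ctx))
		ring_sketch_push(ctx, req);	/* common, fast case */
	else
		list_add_tail(&req->list, &ctx->overflow);	/* rare */

	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	wake_up(&ctx->wait);
}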

> > The other reason I wanted to do this was for the aio attributes stuff -
> > for return values, I think the only sane way is for the return values to
> > go in the ringbuffer, which means records are no longer fixed size so
> > dealing with pages is even more of a pain.
>
> Then let's see that, please.

I was starting on that, but then I got sidetracked with refactoring...
:P

> And can we please stop calling them attributes? They're inputs and
> outputs that change behaviour -- they're interfaces.

Attributes isn't a good name but neither is interfaces, because they
don't exist on their own; they're always attached to some other
interface.

I dunno.

> And no, just for the record, I don't think generic packed variable size
> structs are worth the trouble.
>
> If we're going to do a generic interface extension mechanism then we
> should put it in its own well thought out system calls, not staple it on
> to the side of aio because it's there. It's a really crummy base to
> work from.

Not arguing with you about aio, but most of the use cases I have for it
want aio.

So unless we're going to deprecate the existing aio interfaces and make
something better (I wouldn't complain about that!) I do need to make it
work with aio.

Not that I'm opposed to new syscalls passing attributes to sync versions
of read/write/etc, I just haven't started that yet or really thought
about it.

2012-10-09 22:47:14

by Zach Brown

Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

> If libaio is the only thing in userspace looking at the ringbuffer, and
> if I'm looking at the latest libaio code this shouldn't break
> anything...

We can't assume that libaio is the only thing in userspace using the
mapped buffer -- as scary a thought as that is :).

If we wanted to change the behaviour of the ring we'd give userspace a
way to indicate that it accepts the new semantics. I'm not immediately
sure how best to do that -- haven't given it much thought.

> Sticking the head and tail pointers on the same cacheline is one of my
> main complaints. If you could find that comment I'd be interested in
> reading it, though.

Yeah, that was on my list.. Heh, found it!

/*
* This structure defines the head of a ring of events that is mapped into user
* space. It isn't used anymore because it has a number of design flaws. I'll
* mention them here in the hopes that people read this when considering a
* design for a ring shared between user and kernel space.
*
* - The head and tail pointers are in the same cacheline. This introduces
* false sharing between producers and consumers which could otherwise operate
* independent of one-another, presuming that the producers know ahead of time
* that room has been reserved for them.
*
* - The head and tail semantics are unconventional. They're always less than
* the number of events in the ring and their meaning is reversed from the
* usual construct that one sees in, for example, ethernet hardware where the
* ring is a power of two and the indexes wrap but are masked when used to
* dereference elements.
*
* - Because of the weird head and tail semantics one can't distinguish between
* completely empty and full rings. To work around this the ring is actually
* created with more events than the aio user asked for and that number is
* accounted for separately. The number of free elements in the ring is
* greater than the number of operations that the kernel aio submission will
* allow.
*
* - Synchronizing head and tail updates between the kernel and user space
* requires a u32 cmpxchg. futexes have infrastructure for this which requires
* working with user addresses. Trying to nicely re-use the futex code leads
* to awkward code in aio which sometimes wants to work through kmap()
* addresses and other times wants to work through the mmaped user space
* pointers.
*
* - The head and tail pointers are restricted in value to being less than the
* number of elements in the ring. This means that a cmpxchg can see an index
* which has wrapped and confuse it for the index that it read before trying
* cmpxchg. This can end up sending a duplicated previous event instead of a
* currently queued event in the worst case.
*
* - User space doesn't explicitly indicate that it wants to work with the ring
* from user space. Even if user space never touches the ring the kernel still
* has to pay the cost of the potential sharing with user space when inserting
* events.
*
* - The kernel magically maps it into the address space. Users can't put it
* in whatever memory they like, like shared mem or hugetlb pages or tmpfs
* pages that they want to sendfile out of.
*
* - The ring is allocated out of unswappable kernel memory. It would be
* better to have the ring allocated in userspace and given to the kernel. In
* the common case the pages will be present and the overhead is minimal and
* complexity is kept out of the kernel. In the worst case of memory pressure
* the kernel has fewer pinned pages tying its hands.
*/

The submission and completion interface that I threw together for the
acall experiments took all this into account:

http://lwn.net/Articles/316806/

Man, this all brings back memories :).

- z

2012-10-09 22:55:17

by Kent Overstreet

Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

On Tue, Oct 09, 2012 at 03:47:03PM -0700, Zach Brown wrote:
> > If libaio is the only thing in userspace looking at the ringbuffer, and
> > if I'm looking at the latest libaio code this shouldn't break
> > anything...
>
> We can't assume that libaio is the only thing in userspace using the
> mapped buffer -- as scary a thought as that is :).
>
> If we wanted to change the behaviour of the ring we'd give userspace a
> way to indicate that it accepts the new semantics. I'm not immediately
> sure how best to do that -- haven't given it much thought.

Well, the ringbuffer does have those compat flags and incompat flags.
Which libaio conveniently doesn't check, but for what it does it
shouldn't really matter I guess.
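
A minimal sketch of what checking them from userspace could look like (the
struct layout here is copied from the kernel's struct aio_ring of this era,
the way libaio does it; it isn't an exported uapi header, so treat the field
order as an assumption):

#include <linux/aio_abi.h>	/* struct io_event */

struct aio_ring_user {
	unsigned	id;
	unsigned	nr;		/* number of io_events */
	unsigned	head;
	unsigned	tail;
	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* sizeof(struct aio_ring) */
	struct io_event	io_events[0];
};

#define AIO_RING_MAGIC	0xa10a10a1

static int ring_looks_sane(const struct aio_ring_user *ring)
{
	/* Any incompat bit we don't understand: fall back to io_getevents(). */
	return ring->magic == AIO_RING_MAGIC && ring->incompat_features == 0;
}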

I figure if anyone else is using the ringbuffer directly and not
checking the flag fields... well, they deserve to have their stuff
broken :P

OTOH, that is the same situation the tracing stuff was in... but the
stuff that used the tracing fields incorrectly was more widespread.

Anyways, if we can't change the ringbuffer at all we could always create
a new version of the io_setup() syscall that gives you a new ringbuffer
format.

That'd arguably be better anyways, since if we went the incompat flags
route old code would be stuck with not being able to use the ringbuffer
at all on newer kernels. But it would be more of a pain and more kernel
code.


> > Sticking the head and tail pointers on the same cacheline is one of my
> > main complaints. If you could find that comment I'd be interested in
> > reading it, though.
>
> Yeah, that was on my list.. Heh, found it!

Thanks!

I'm wondering what interest there is in creating a new aio interface to
solve these and other issues.

I kind of feel like as long as we've got a list of complaints, we should
prototype something in one place that fixes all our complaints... think
of it as documenting all the known issues, if nothing else.

>
> /*
> * This structure defines the head of a ring of events that is mapped into user
> * space. It isn't used anymore because it has a number of design flaws. I'll
> * mention them here in the hopes that people read this when considering a
> * design for a ring shared between user and kernel space.
> *
> * - The head and tail pointers are in the same cacheline. This introduces
> * false sharing between producers and consumers which could otherwise operate
> * independent of one-another, presuming that the producers know ahead of time
> * that room has been reserved for them.
> *
> * - The head and tail semantics are unconventional. They're always less than
> * the number of events in the ring and their meaning is reversed from the
> * usual construct that one sees in, for example, ethernet hardware where the
> * ring is a power of two and the indexes wrap but are masked when used to
> * dereference elements.
> *
> * - Because of the weird head and tail semantics one can't distinguish between
> * completely empty and full rings. To work around this the ring is actually
> * created with more events than the aio user asked for and that number is
> * accounted for separately. The number of free elements in the ring is
> * greater than the number of operations that the kernel aio submission will
> * allow.
> *
> * - Synchronizing head and tail updates between the kernel and user space
> * requires a u32 cmpxchg. futexes have infrastructure for this which requires
> * working with user addresses. Trying to nicely re-use the futex code leads
> * to awkward code in aio which sometimes wants to work through kmap()
> * addresses and other times want to work through the mmaped user space
> * pointers.
> *
> * - The head and tail pointers are restricted in value to being less than the
> * number of elements in the ring. This means that a cmpxchg can see an index
> * which has wrapped and confuse it for the index that it read before trying
> * cmpxchg. This can end up sending a duplicated previous event instead of a
> * currently queued event in the worst case.
> *
> * - User space doesn't explicitly indicate that it wants to work with the ring
> * from user space. Even if user space never touches the ring the kernel still
> * has to pay the cost of the potential sharing with user space when inserting
> * events.
> *
> * - The kernel magically maps it into the address space. Users can't put it
> * in whatever memory they like, like shared mem or hugetlb pages or tmpfs
> * pages that they want to sendfile out of.
> *
> * - The ring is allocated out of unswappable kernel memory. It would be
> * better to have the ring allocated in userspace and given to the kernel. In
> * the common case the pages will be present and the overhead is minimal and
> * complexity is kept out of the kernel. In the worst case of memory pressure
> * the kernel has fewer pinned pages tying its hands.
> */
>
> The submission and completion interface that I threw together for the
> acall experiments took all this into account:
>
> http://lwn.net/Articles/316806/
>
> Man, this all brings back memories :).
>
> - z

2012-10-09 22:58:46

by Zach Brown

Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

> Not if we decouple the ringbuffer size from max_requests.

Hmm, interesting.

> This would be useful to do anyways because right now, allocating a kiocb
> has to take a global refcount and check head and tail in the ringbuffer
> just so it can avoid overflowing the ringbuffer.

I'm not sure what you mean by a 'global refcount'.. do you mean the
per-mm ctx_lock?

> If we change aio_complete() so that if the ringbuffer is full then the
> kiocb just goes on a linked list - we can size the ringbuffer so this
> doesn't happen normally and avoid the global synchronization in the fast
> path.

How would completion events make their way from the list to the ring if
an app is only checking the ring for completions from userspace?

- z

2012-10-09 23:11:10

by Zach Brown

Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmpxchg(), fix bug

> Well, the ringbuffer does have those compat flags and incompat flags.
> Which libaio conveniently doesn't check, but for what it does it
> shouldn't really matter I guess.

Well, the presumed point of the incompat flags would be to tell an app
that it isn't going to get what it expects! Ideally it'd abort, not
blindly charge on ahead.

> I figure if anyone else is using the ringbuffer directly and not
> checking the flag fields... well, they deserve to have their stuff
> broken :P

Nope! I subscribe to the unpopular notion that you don't change
interfaces just because you can.

> Anyways, if we can't change the ringbuffer at all we could always create
> a new version of the io_setup() syscall that gives you a new ringbuffer
> format.

That might be the easiest way to tweak the existing aio interface, yeah.
Jens wants to do that in his patches as well. He used the hack of
setting nr_events to INT_MAX to indicate not using the ring, but adding
a flags parameter to a new syscall seems a lot less funky.

> I'm wondering what interest there is in creating a new aio interface to
> solve these and other issues.
>
> I kind of feel like as long as we've got a list of complaints, we should
> prototype something in one place that fixes all our complaints... think
> of it as documenting all the known issues, if nothing else.

I'd help out with that, yes.

On my list of complaints would be how heavy the existing aio
setup/submission/completion/teardown interface is. A better interface
should make it trivial to just bang out a call and synchronously wait
for it. Get that right and you don't have to mess around with aio and
sync variants.

One of the harder things to get right would be specifying the DIF/DIX
checksums per sector. But I think we should. Poor Martin has been hung
out to dry for too long.

And perhaps obviously, I'd start with the acall stuff :). It was a lot
lighter. We could talk about how to make it extensible without going
all the way to the generic packed variable size duplicating or not and
returning or not or.. attributes :).

- z

2012-10-10 00:06:09

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

On Tue, Oct 09, 2012 at 04:10:59PM -0700, Zach Brown wrote:
> > Well, the ringbuffer does have those compat flags and incompat flags.
> > Which libaio conveniently doesn't check, but for what it does it
> > shouldn't really matter I guess.
>
> Well, the presumed point of the incompat flags would be to tell an app
> that it isn't going to get what it expects! Ideally it'd abort, not
> blindly charge on ahead.
>
> > I figure if anyone else is using the ringbuffer directly and not
> > checking the flag fields... well, they deserve to have their stuff
> > broken :P
>
> Nope! I subscribe to the unpopular notion that you don't change
> interfaces just because you can.

Heh, I won't argue.

The AIO ringbuffer stuff just annoys me more than most (it wasn't until
the other day that I realized it was actually exported to userspace...
what led to figuring that out was noticing aio_context_t was a ulong,
and got truncated to 32 bits with a 32 bit program running on a 64 bit
kernel. I'd been horribly misled by the code comments and the lack of
documentation.)

> > Anyways, if we can't change the ringbuffer at all we could always create
> > a new version of the io_setup() syscall that gives you a new ringbuffer
> > format.
>
> That might be the easiest way to tweak the existing aio interface, yeah.
> Jens wants to do that in his patches as well. He used the hack of
> setting nr_events to INT_MAX to indicate not using the ring, but adding
> a flags parameter to a new syscall seems a lot less funky.

Alright. Maybe I'll start hacking on that...

> > I'm wondering what interest there is in creating a new aio interface to
> > solve these and other issues.
> >
> > I kind of feel like as long as we've got a list of complaints, we should
> > prototype something in one place that fixes all our complaints... think
> > of it as documenting all the known issues, if nothing else.
>
> I'd help out with that, yes.
>
> On my list of complaints would be how heavy the existing aio
> setup/submission/completion/teardown interface is. A better interface
> should make it trivial to just bang out a call and synchronously wait
> for it. Get that right and you don't have to mess around with aio and
> sync variants.

Hmm yeah, setup and teardown is a good point.

I never liked aio_context_t too much - in some respects it would be
cleaner if it were just implicit and per thread. But we probably can't do
that, since there are legitimate use cases for one thread submitting an
iocb and another thread reaping the events.

But if we do have an explicit handle, I don't see why it shouldn't be a
file descriptor.

But an implicit per thread context might be useful for the use case you
describe... or perhaps we can add a syscall to submit an iocb and wait
for it synchronously, without any aio_context_t involved.

> One of the harder things to get right would be specifying the DIF/DIX
> checksums per sector. But I think we should. Poor Martin has been hung
> out to dry for too long.

Yes, that's one of the things I want to address with the aio attributes
stuff.

>
> And perhaps obviously, I'd start with the acall stuff :). It was a lot
> lighter. We could talk about how to make it extensible without going
> all the way to the generic packed variable size duplicating or not and
> returning or not or.. attributes :).

Link? I haven't heard of acall before.

2012-10-10 00:16:37

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

On Tue, Oct 09, 2012 at 03:58:36PM -0700, Zach Brown wrote:
> > Not if we decouple the ringbuffer size from max_requests.
>
> Hmm, interesting.
>
> > This would be useful to do anyways because right now, allocating a kiocb
> > has to take a global refcount and check head and tail in the ringbuffer
> > just so it can avoid overflowing the ringbuffer.
>
> I'm not sure what you mean by a 'global refcount'.. do you mean the
> per-mm ctx_lock?

kioctx->reqs_active. We just have to keep that count somehow if we're
going to avoid overflowing the ringbuffer.

> > If we change aio_complete() so that if the ringbuffer is full then the
> > kiocb just goes on a linked list - we can size the ringbuffer so this
> > doesn't happen normally and avoid the global synchronization in the fast
> > path.
>
> How would completion events make their way from the list to the ring if
> an app is only checking the ring for completions from userspace?

Either they'd have to make a syscall when the ringbuffer is empty - which
should be fine, because at that point most apps can only sleep or spin
anyway - or the kernel could maintain a flag next to the tail pointer in
the ringbuffer, so userspace can tell when the syscall is actually
necessary and skip it otherwise.

Although - since current libaio skips the syscall if the ringbuffer is
empty, this is yet another thing we can't do with the current
ringbuffer.

Crap.

Well, we should be able to hack around it with the existing
ringbuffer... normally, if the ringbuffer is full and stuff goes on the
list, then there's going to be more completions coming later and stuff
would get pulled off the list then.

The only situation you have to worry about is when the ringbuffer fills
up and stuff goes on the list, and then completions completely stop -
this should be a rare enough situation that maybe we could just hack
around it with a timer that gets flipped on when the list isn't empty.

Also, for this to be an issue at all, _all_ the reaping would have to be
done from userspace - since existing libaio doesn't do that, there may
not be any code out there which triggers it.
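
Roughly, the overflow-list fallback being described, as a hypothetical
pseudo-kernel sketch - ctx->overflow_list, ctx->completion_lock, ring_full(),
ring_insert() and the ki_res field are all made up for illustration, and
ki_list is reused here for the overflow list:

/* Hypothetical: complete into the ring, overflowing onto a list when full. */
static void sketch_aio_complete(struct kioctx *ctx, struct kiocb *iocb, long res)
{
        unsigned long flags;

        iocb->ki_res = res;     /* hypothetical field carrying the result */

        spin_lock_irqsave(&ctx->completion_lock, flags);

        /* Drain anything that overflowed earlier, preserving order. */
        while (!list_empty(&ctx->overflow_list) && !ring_full(ctx)) {
                struct kiocb *req = list_first_entry(&ctx->overflow_list,
                                                     struct kiocb, ki_list);
                list_del(&req->ki_list);
                ring_insert(ctx, req);  /* copies ki_obj.user and ki_res */
        }

        if (!ring_full(ctx))
                ring_insert(ctx, iocb);
        else
                /* Rare: the ring is sized so this normally never happens. */
                list_add_tail(&iocb->ki_list, &ctx->overflow_list);

        spin_unlock_irqrestore(&ctx->completion_lock, flags);
}

The io_getevents() path would do the same drain under the same lock, which is
why userspace would have to enter the kernel at least when the ring looks
empty.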

2012-10-10 00:17:36

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 3/5] aio: Rewrite refcounting

On Tue, Oct 09, 2012 at 03:35:04PM -0700, Zach Brown wrote:
> > Alright... send it out then.
>
> Workin' on it! :)
>
> > Also, do you know which branch Jens has his patches in?
>
> http://git.kernel.dk/?p=linux-block.git;a=commit;h=6b6723fc3e4f24dbd80526df935ca115ead578c6
>
> https://plus.google.com/111643045511375507360/posts
>
> As far as I know, he hasn't had a chance to work on it recently.

Thanks - wasn't sure if I was looking at the right branch. Looking at it
now...

2012-10-10 00:26:45

by Zach Brown

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

> The AIO ringbuffer stuff just annoys me more than most

Not more than everyone, though, I can personally promise you that :).

> (it wasn't until
> the other day that I realized it was actually exported to userspace...
> what led to figuring that out was noticing aio_context_t was a ulong,
> and got truncated to 32 bits with a 32 bit program running on a 64 bit
> kernel. I'd been horribly misled by the code comments and the lack of
> documentation.)

Yeah. It's the userspace address of the mmaped ring. This has annoyed
the process migration people who can't recreate the context in a new
kernel because there's no userspace interface to specify creation of a
context at a specific address.

> But if we do have an explicit handle, I don't see why it shouldn't be a
> file descriptor.

Because they're expensive to create and destroy when compared to a
single system call. Imagine that we're waiting for a single
completion to implement a cheap one-off sync call. Imagine it's a
buffered op which happens to hit the cache and is really quick.

(And they're annoying to manage: libraries and O_CLOEXEC, running into
fd/file limit tunables, bleh.)

If the 'completion context' is no more than a structure in userspace
memory then a lot of stuff just works. Tasks can share it amongst
themselves as they see fit. A trivial one-off sync call can just dump
it on the stack and point to it. It doesn't have to be specifically
torn down on task exit.
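
A hypothetical user-side sketch of what that looks like - the struct layout
and the sketch_* calls are invented for illustration, not the acall ABI:

#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>

/* A completion context that is nothing but ordinary user memory. */
struct sketch_completion {
        uint64_t cookie;        /* copied back from the submission */
        int64_t  result;        /* the operation's return value */
        uint32_t done;          /* kernel stores nonzero on completion */
};

/* Hypothetical syscall wrappers, declared only to make the sketch hang together. */
extern int sketch_submit_pread(int fd, void *buf, size_t len, off_t off,
                               struct sketch_completion *comp);
extern int sketch_wait(struct sketch_completion *comp);

/* A one-off "sync" call: the context lives on the caller's stack and needs
 * no setup or teardown. */
static long sketch_sync_pread(int fd, void *buf, size_t len, off_t off)
{
        struct sketch_completion comp = { .cookie = 1 };

        sketch_submit_pread(fd, buf, len, off, &comp);
        sketch_wait(&comp);     /* e.g. a futex-style wait on comp.done */

        return comp.result;
}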

> > And perhaps obviously, I'd start with the acall stuff :). It was a lot
> > lighter. We could talk about how to make it extensible without going
> > all the way to the generic packed variable size duplicating or not and
> > returning or not or.. attributes :).
>
> Link? I haven't heard of acall before.

I linked to it after that giant silly comment earlier in the thread,
here it is again:

http://lwn.net/Articles/316806/

There's a mostly embarrassing video of a jetlagged me giving that talk at
LCA kicking around.. ah, here:

http://mirror.linux.org.au/pub/linux.conf.au/2009/Thursday/131.ogg

- z

2012-10-10 00:36:36

by Zach Brown

[permalink] [raw]
Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

> The only situation you have to worry about is when the ringbuffer fills
> up and stuff goes on the list, and then completions completely stop -
> this should be a rare enough situation that maybe we could just hack
> around it with a timer that gets flipped on when the list isn't empty.

Right. And this is when we hopefully realize that we're adding overhead
and complexity (how long's the timer? always fire it? MAKE IT STOP)
and are actually making the system worse, not better.

> Also, for this to be an issue at all, _all_ the reaping would have to be
> done from userspace - since existing libaio doesn't do that, there may
> not be any code out there which triggers it.

And there may be. We default to not breaking interfaces. Seriously.

- z

2012-10-10 00:47:52

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

On Tue, Oct 09, 2012 at 05:26:34PM -0700, Zach Brown wrote:
> > The AIO ringbuffer stuff just annoys me more than most
>
> Not more than everyone, though, I can personally promise you that :).
>
> > (it wasn't until
> > the other day that I realized it was actually exported to userspace...
> > what led to figuring that out was noticing aio_context_t was a ulong,
> > and got truncated to 32 bits with a 32 bit program running on a 64 bit
> > kernel. I'd been horribly misled by the code comments and the lack of
> > documentation.)
>
> Yeah. It's the userspace address of the mmaped ring. This has annoyed
> the process migration people who can't recreate the context in a new
> kernel because there's no userspace interface to specify creation of a
> context at a specific address.

Yeah I did finally figure that out - and a file descriptor that
userspace then mmap()ed would solve that problem...

>
> > But if we do have an explicit handle, I don't see why it shouldn't be a
> > file descriptor.
>
> Because they're expensive to create and destroy when compared to a
> single system call. Imagine that we're waiting for a single
> completion to implement a cheap one-off sync call. Imagine it's a
> buffered op which happens to hit the cache and is really quick.

True. But that could be solved with a separate interface that either
doesn't use a context to submit a call synchronously, or uses an
implicit per thread context.

> (And they're annoying to manage: libraries and O_CLOEXEC, running into
> fd/file limit tunables, bleh.)

I don't have a _strong_ opinion there, but my intuition is that we
shouldn't be creating new types of handles without a good reason. I
don't think the annoyances are for the most part particular to file
descriptors, I think they tend to be applicable to handles in general and
at least with file descriptors they're known and solved.

Also, with a file descriptor it naturally works with an epoll event
loop. (eventfd for aio is a hack).

> If the 'completion context' is no more than a structure in userspace
> memory then a lot of stuff just works. Tasks can share it amongst
> themselves as they see fit. A trivial one-off sync call can just dump
> it on the stack and point to it. It doesn't have to be specifically
> torn down on task exit.

That would be awesome, though for it to be worthwhile there couldn't be
any kernel notion of a context at all and I'm not sure if that's
practical. But the idea hadn't occurred to me before and I'm sure you've
thought about it more than I have... hrm.

Oh hey, that's what acall does :P

For completions though you really want the ringbuffer pinned... what do
you do about that?

> > > And perhaps obviously, I'd start with the acall stuff :). It was a lot
> > > lighter. We could talk about how to make it extensible without going
> > > all the way to the generic packed variable size duplicating or not and
> > > returning or not or.. attributes :).
> >
> > Link? I haven't heard of acall before.
>
> I linked to it after that giant silly comment earlier in the thread,
> here it is again:
>
> http://lwn.net/Articles/316806/

Oh whoops, hadn't started reading yet - looking at it now :)

> There's a mostly embarrassing video of a jetlagged me giving that talk at
> LCA kicking around.. ah, here:
>
> http://mirror.linux.org.au/pub/linux.conf.au/2009/Thursday/131.ogg
>
> - z

2012-10-10 01:09:39

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 4/5] aio: vmap ringbuffer

On Tue, Oct 09, 2012 at 05:36:26PM -0700, Zach Brown wrote:
> > The only situation you have to worry about is when the ringbuffer fills
> > up and stuff goes on the list, and then completions completely stop -
> > this should be a rare enough situation that maybe we could just hack
> > around it with a timer that gets flipped on when the list isn't empty.
>
> Right. And this is when we hopefully realize that we're adding overhead
> and complexity (how long's the timer? always fire it? MAKE IT STOP)
> and are actually making the system worse, not better.

Can still prototype it, and if it's that ugly... I throw away code all
the time :P

> > Also, for this to be an issue at all, _all_ the reaping would have to be
> > done from userspace - since existing libaio doesn't do that, there may
> > not be any code out there which triggers it.
>
> And there may be. We default to not breaking interfaces. Seriously.

All the more reason to think about it now so we don't screw up the next
interfaces :)

2012-10-10 15:17:37

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

On Tue, Oct 09, 2012 at 02:37:00PM -0700, Kent Overstreet wrote:
> > Honestly: I wouldn't bother. Nothing of consequence uses cancel.
> >
> > I have an RFC patch series that tears it out. Let me polish that up
> > send it out, I'll cc: you.
>
> Even better :)
>
> I've been looking at aio locking the past few days, and I was getting
> ready to write something up about cancellation to the lists.

I can definitely think of use cases (inside of Google) where we could
really use aio_cancel. The issue is it's hard to use it safely (i.e., we
need to know whether the file system has already extended the file
before we can know whether or not we can safely cancel the I/O).

> Short version, supporting cancellation without global synchronization is
> possible but it'd require help from the allocator.

Well, we would need some kind of flag to indicate whether cancellation
is possible, yes. But if we're doing AIO to a raw disk, or we're
talking about a read request, we wouldn't need any help from the
file system's block allocator.

And maybe the current way of doing things isn't the best way. But it
would be nice if we didn't completely give up on the functionality of
aio_cancel.

- Ted

2012-10-10 21:21:01

by Zach Brown

[permalink] [raw]
Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

> And maybe the current way of doing things isn't the best way. But it
> would be nice if we didn't completely give up on the functionality of
> aio_cancel.

I sympathize, but the reality is that the current infrastructure
is very bad and no one is using it.

It's not like we're getting rid of the syscall. It'll be behaving
exactly as it does today: returning the error code that indicates that
cancellation failed because it lost the race with completion. Every
caller has to cope with that to use cancel safely.

So if someone eventually implements iocb cancel safely we'll be able to
plumb it back under the aio syscalls. But until that day I see no
reason to carry around buggy infrastructure that is only slowing down
the fast path.

- z

2012-10-10 21:43:26

by Zach Brown

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

> True. But that could be solved with a separate interface that either
> doesn't use a context to submit a call synchronously, or uses an
> implicit per thread context.

Sure, but why bother if we can make the one submission interface fast
enough to satisfy quick callers? Less is more, and all that.

> I don't have a _strong_ opinion there, but my intuition is that we
> shouldn't be creating new types of handles without a good reason. I
> don't think the annoyances are for the most part particular to file
> descriptors, I think they tend to be applicable to handles in general and
> at least with file descriptors they're known and solved.

I strongly disagree. That descriptors are an expensive, limited
resource is a perfectly good reason not to make them required to access
the ring.

> That would be awesome, though for it to be worthwhile there couldn't be
> any kernel notion of a context at all and I'm not sure if that's
> practical. But the idea hadn't occurred to me before and I'm sure you've
> thought about it more than I have... hrm.
>
> Oh hey, that's what acall does :P

:)

> For completions though you really want the ringbuffer pinned... what do
> you do about that?

I don't think the kernel has to mandate that, no. The code has to deal
with completions faulting, but they probably won't. In acall it
happened that completions always came from threads that could block so
its coping mechanism was to just use put_user() :).

If userspace wants them rings locked, they can mlock() the memory.

Think about it from another angle: the current mechanism of creating an
aio ring is a way to allocate pinned memory outside of the usual mlock
accounting. This could be abused, so aio grew an additional tunable to
limit the number of total entries in rings in the system.

By putting the ring in normal user memory we avoid that problem
entirely.

- z

2012-10-10 23:22:03

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

On Wed, Oct 10, 2012 at 02:20:51PM -0700, Zach Brown wrote:
>
> I sympathize, but the reality is that the current infrastructure
> is very bad and no one is using it.
>
> It's not like we're getting rid of the syscall. It'll be behaving
> exactly as it does today: returning the error code that indicates that
> cancellation failed because it lost the race with completion. Every
> caller has to cope with that to use cancel safely.

Yes, I realize it doesn't work today. But hopefully we'll be able to
reimplement it in a cleaner way. That's why I said, I hope we don't
give up on the functionality of aio_cancel --- not that I'm against
removing what we currently have.

Cheers,

- Ted

2012-10-11 02:42:32

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 2/5] aio: kiocb_cancel()

On Wed, Oct 10, 2012 at 07:03:56AM -0400, Theodore Ts'o wrote:
> On Tue, Oct 09, 2012 at 02:37:00PM -0700, Kent Overstreet wrote:
> > > Honestly: I wouldn't bother. Nothing of consequence uses cancel.
> > >
> > > I have an RFC patch series that tears it out. Let me polish that up
> > > send it out, I'll cc: you.
> >
> > Even better :)
> >
> > I've been looking at aio locking the past few days, and I was getting
> > ready to write something up about cancellation to the lists.
>
> I can definitely think of use cases (inside of Google) where we could
> really use aio_cancel. The issue is it's hard to use it safely (i.e., we
> need to know whether the file system has already extended the file
> before we can know whether or not we can safely cancel the I/O).
>
> > Short version, supporting cancellation without global synchronization is
> > possible but it'd require help from the allocator.
>
> Well, we would need some kind of flag to indicate whether cancellation
> is possible, yes. But if we're doing AIO to a raw disk, or we're
> talking about a read request, we wouldn't need any help from the
> file system's block allocator.
>
> And maybe the current way of doing things isn't the best way. But it
> would be nice if we didn't completely give up on the functionality of
> aio_cancel.

I thought about this more, and I think ripping out cancellation is a bad
idea in the long term too.

With what aio is capable of now (O_DIRECT), it's certainly arguable that
cancellation doesn't give you much and isn't worth the hassle of
implementing.

But, if we ever fix aio (and I certainly hope we do) and implement
something capable of doing arbitrary IO asynchronously, then we really
are going to need cancellation, because of IO to sockets/pipes that can
be blocked for an unbounded amount of time.

Even if it turns out programs don't ever need cancellation in practice,
the real kicker is checkpoint/restore - for checkpoint/restore we've got
to enumerate all the outstanding operations or wait for them to complete
(for the ones that do complete in bounded time), and that's the hard
part of cancellation right there.

Zach might object here and point out that if we implement asynchronous
operations with kernel threads, we can implement cancelling by just
killing the thread. But this is only relevant if we _only_ implement
asynchronous operations with threads in aio v2, and I personally think
that's a lousy idea.

The reason is that there are a lot of things wrong with aio that need to be
fixed in aio v2, so aio v2 really needs to be a complete replacement for existing
aio users. It won't be if we start using kernel threads where we weren't
before - kernel threads are cheap but they aren't free, we'll regress on
performance and that means the new interfaces probably won't get used at
all by the existing users.

This isn't a big deal - we can implement aio v2 as a generic async
syscall mechanism, and then the default implementation for any given
syscall will just be the thread pool stuff, with optimized async
implementations used where they're available - this'll let us make use
of much of the existing infrastructure.

But it does mean we need a cancellation mechanism that isn't tied to
threads - i.e. does basically what the existing aio cancellation does.

So, IMO we should bite the bullet and do cancellation right now.

Also, I don't think there's anything really terrible about the existing
cancellation mechanism (unlike retries, that code is... wtf), it's just
problematic for locking/performance. But that's fixable.

It is definitely possible to implement cancellation such that it doesn't
cost anything when it's not being used, but we do need some help from
the allocator.

Say we had a way of iterating over all the slabs in the kiocb cache:
if we pass a constructor to kmem_cache_create() we can guarantee that
all the refcounts are initialized to 0. Combine that with
SLAB_DESTROY_BY_RCU, and we can safely do a try_get_kiocb() when we're
iterating over all the kiocbs looking for the one we want.

The missing piece is a way to iterate over all those slabs - I haven't
been able to find any exposed interface for that.
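
A rough sketch of the lookup that would enable, assuming ki_users were an
atomic_t, the kiocb cache were created with SLAB_DESTROY_BY_RCU plus a
constructor that zeroes the refcount, and with the missing iteration
primitive shown as a hypothetical for_each_kiocb_in_cache():

static bool try_get_kiocb(struct kiocb *req)
{
        /* A refcount of 0 means free or mid-recycle; only grab live kiocbs. */
        return atomic_inc_not_zero(&req->ki_users);
}

static struct kiocb *find_kiocb_to_cancel(struct kioctx *ctx,
                                          void __user *user_iocb)
{
        struct kiocb *req, *found = NULL;

        rcu_read_lock();
        for_each_kiocb_in_cache(req) {          /* the missing piece */
                if (!try_get_kiocb(req))
                        continue;
                /*
                 * SLAB_DESTROY_BY_RCU only guarantees the memory stays a
                 * kiocb while we hold rcu_read_lock(); the object may have
                 * been recycled, so recheck identity after taking the ref.
                 */
                if (req->ki_ctx == ctx && req->ki_obj.user == user_iocb) {
                        found = req;
                        break;
                }
                aio_put_req(req);
        }
        rcu_read_unlock();

        return found;
}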

If we could implement such a mechanism, that would be the way to go (and
I don't think it'd be useful just for aio - this sort of thing comes up
elsewhere, particularly in anything that has to time out operations).

The other option would be to use a simpler dedicated allocator - I have
a simple, fast percpu allocator I wrote a while back for some driver code
that allocates out of a fixed-size array (I needed to be able to
reference objects by a 16-bit id, i.e. an index into the array). I could
easily change that to allocate pages (i.e. slabs) lazily... then we'd
have one of these allocator structs per kioctx so they'd get freed when
the kioctx goes away and we'd have less to look through when we want to
cancel something.
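
A stripped-down sketch of that kind of allocator, ignoring the percpu caching
and the lazy page allocation; every name here is hypothetical:

/* Hypothetical per-kioctx kiocb pool addressable by a 16-bit id. */
struct kiocb_pool {
        spinlock_t      lock;
        unsigned        nr;             /* capacity, at most 1 << 16 */
        unsigned        free_head;      /* first free slot, == nr when exhausted */
        unsigned        *free_next;     /* freelist links, one per slot */
        struct kiocb    *slots;         /* backing array; could be populated
                                         * a page (slab) at a time */
};

static struct kiocb *pool_alloc(struct kiocb_pool *p, u16 *id)
{
        struct kiocb *req = NULL;

        spin_lock(&p->lock);
        if (p->free_head != p->nr) {
                *id = p->free_head;     /* fits in 16 bits since nr <= 1 << 16 */
                p->free_head = p->free_next[*id];
                req = &p->slots[*id];
        }
        spin_unlock(&p->lock);
        return req;
}

static void pool_free(struct kiocb_pool *p, u16 id)
{
        spin_lock(&p->lock);
        p->free_next[id] = p->free_head;
        p->free_head = id;
        spin_unlock(&p->lock);
}

Cancellation could then walk slots[0..nr) looking for live kiocbs, and the
whole pool goes away with the kioctx.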

This'd probably be the shortest path, but - while it's no slower than
the existing slab allocators, it doesn't do NUMA-aware allocation. I don't
know how much that matters in practice with objects this small.
Besides that, adding another general purpose allocator would IMO be a
little silly.

Either way, IMO the right way to do it is with help from the allocator.

2012-10-11 02:51:40

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

On Wed, Oct 10, 2012 at 02:43:15PM -0700, Zach Brown wrote:
> > True. But that could be solved with a separate interface that either
> > doesn't use a context to submit a call synchronously, or uses an
> > implicit per thread context.
>
> Sure, but why bother if we can make the one submission interface fast
> enough to satisfy quick callers? Less is more, and all that.

Very true, if it's possible. I'm just still skeptical.

> > I don't have a _strong_ opinion there, but my intuition is that we
> > shouldn't be creating new types of handles without a good reason. I
> > don't think the annoyances are for the most part particular to file
> > descriptors, I think they tend to be applicable to handles in general and
> > at least with file descriptors they're known and solved.
>
> I strongly disagree. That descriptors are an expensive, limited
> resource is a perfectly good reason not to make them required to access
> the ring.

What's so special about aio vs. epoll, and now signalfd/eventfd/timerfd
etc.?

> > That would be awesome, though for it to be worthwhile there couldn't be
> > any kernel notion of a context at all and I'm not sure if that's
> > practical. But the idea hadn't occurred to me before and I'm sure you've
> > thought about it more than I have... hrm.
> >
> > Oh hey, that's what acall does :P
>
> :)
>
> > For completions though you really want the ringbuffer pinned... what do
> > you do about that?
>
> I don't think the kernel has to mandate that, no. The code has to deal
> with completions faulting, but they probably won't. In acall it
> happened that completions always came from threads that could block so
> its coping mechanism was to just use put_user() :).

Yeah, but that means the completion has to be delivered from process
context. That's not what aio does today, and it'd be a real performance
regression.

I don't know of a way around that myself.

> If userspace wants them rings locked, they can mlock() the memory.
>
> Think about it from another angle: the current mechanism of creating an
> aio ring is a way to allocate pinned memory outside of the usual mlock
> accounting. This could be abused, so aio grew an additional tunable to
> limit the number of total entries in rings in the system.
>
> By putting the ring in normal user memory we avoid that problem
> entirely.

No different from any other place the kernel allocates memory on behalf
of userspace... it needs a general solution, not a bunch of special case
solutions (though since the general solution is memcg you might argue
the cure is worse than the disease... :P)

2012-10-11 16:43:22

by Zach Brown

[permalink] [raw]
Subject: Re: [PATCH 5/5] aio: Refactor aio_read_evt, use cmxchg(), fix bug

> Yeah, but that means the completion has to be delivered from process
> context. That's not what aio does today, and it'd be a real performance
> regression.

It'd only have to complete from process context if it faults. The
cheapest possible delivery mechanism is simple cpu stores. In the vast
majority of cases the ring will be resident and it'll be done. In rare
cases it could fall back to a deferred completion. If apps can't
stomach that latency and want to pay the overhead of pinning to remove
that risk, they're welcome to do so.
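
A sketch of that delivery path, assuming the completion runs with the
submitter's mm live (otherwise it would always take the deferred path);
queue_deferred_completion() is a stand-in and the tail-pointer update is
omitted:

/* Hypothetical: plain stores when the ring page is resident, defer on fault. */
static void sketch_deliver_event(struct io_event __user *slot,
                                 const struct io_event *ev)
{
        pagefault_disable();
        if (!__copy_to_user_inatomic(slot, ev, sizeof(*ev))) {
                /* Fast path: the page was resident, the store completed. */
                pagefault_enable();
                return;
        }
        pagefault_enable();

        /*
         * Slow path: the page wasn't resident (or we're in a context that
         * can't touch user memory), so hand the event off to process
         * context, which can fault the ring back in and use copy_to_user().
         */
        queue_deferred_completion(slot, ev);
}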

That's my hope, anyway.

- z