When using a large number of threads performing AIO operations, the
ioctx list can grow to a significant number of entries, making context
lookup a significant overhead. For example, when running this fio script:

rw=randrw; size=256k; directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=512; thread; loops=100

on an EXT2 filesystem mounted on top of a ramdisk, we can observe up to
30% of CPU time spent in lookup_ioctx:
32.51% [guest.kernel] [g] lookup_ioctx
9.19% [guest.kernel] [g] __lock_acquire.isra.28
4.40% [guest.kernel] [g] lock_release
4.19% [guest.kernel] [g] sched_clock_local
3.86% [guest.kernel] [g] local_clock
3.68% [guest.kernel] [g] native_sched_clock
3.08% [guest.kernel] [g] sched_clock_cpu
2.64% [guest.kernel] [g] lock_release_holdtime.part.11
2.60% [guest.kernel] [g] memcpy
2.33% [guest.kernel] [g] lock_acquired
2.25% [guest.kernel] [g] lock_acquire
1.84% [guest.kernel] [g] do_io_submit
This patch converts the ioctx list to a radix tree. For a performance
comparison, the above fio script was run on a 2-socket, 8-core machine.
These are the results (average and %rsd of 10 runs) for the original
list-based implementation and for the radix-tree-based implementation:
cores              1         2         4         8        16        32
list (ms)     109376     69119     35682     22671     19724     16408
 %rsd          0.69%     1.15%     1.17%     1.21%     1.71%     1.43%
radix (ms)     73651     41748     23028     16766     15232     13787
 %rsd          1.19%     0.98%     0.69%     1.13%     0.72%     0.75%
radix/list    66.12%    65.59%    66.63%    72.31%    77.26%    83.66%
To assess the impact of the patch on the typical case of a single ioctx
per process, the following fio script was run:

rw=randrw; size=100m; directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=1; thread; loops=100

on the same system, with the following results:
list (ms)      58892
 %rsd          0.91%
radix (ms)     59404
 %rsd          0.81%
radix/list   100.87%
Cc: Andi Kleen <[email protected]>
Signed-off-by: Octavian Purdila <[email protected]>
---
Changes since v2:
 * rebased on top of next/akpm
 * add %rsd to measurements

Changes since v1:
 * add performance comparison for the typical case of having only one
   ctx per process
 * use ARRAY_SIZE and drop tracking idx as it is not needed
arch/s390/mm/pgtable.c | 4 +--
fs/aio.c | 76 ++++++++++++++++++++++++++++------------------
include/linux/mm_types.h | 3 +-
kernel/fork.c | 2 +-
4 files changed, 52 insertions(+), 33 deletions(-)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2accf71..54186e7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -894,7 +894,7 @@ int s390_enable_sie(void)
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
- !hlist_empty(&tsk->mm->ioctx_list) ||
+ tsk->mm->ioctx_rtree.rnode ||
#endif
tsk->mm != tsk->active_mm) {
task_unlock(tsk);
@@ -921,7 +921,7 @@ int s390_enable_sie(void)
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
- !hlist_empty(&tsk->mm->ioctx_list) ||
+ tsk->mm->ioctx_rtree.rnode ||
#endif
tsk->mm != tsk->active_mm) {
mmput(mm);
diff --git a/fs/aio.c b/fs/aio.c
index 5b7ed78..4ceed70 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -38,6 +38,7 @@
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -69,9 +70,7 @@ struct kioctx_cpu {
struct kioctx {
struct percpu_ref users;
- /* This needs improving */
unsigned long user_id;
- struct hlist_node list;
struct __percpu kioctx_cpu *cpu;
@@ -457,10 +456,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
aio_nr += ctx->max_reqs;
spin_unlock(&aio_nr_lock);
- /* now link into global list. */
+ /* now insert into the radix tree */
+ err = radix_tree_preload(GFP_KERNEL);
+ if (err)
+ goto out_cleanup;
spin_lock(&mm->ioctx_lock);
- hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+ err = radix_tree_insert(&mm->ioctx_rtree, ctx->user_id, ctx);
spin_unlock(&mm->ioctx_lock);
+ radix_tree_preload_end();
+ if (err) {
+ WARN_ONCE(1, "aio: insert into ioctx tree failed: %d", err);
+ goto out_cleanup;
+ }
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
@@ -501,8 +508,8 @@ static void kill_ioctx_rcu(struct rcu_head *head)
static void kill_ioctx(struct kioctx *ctx)
{
if (percpu_ref_kill(&ctx->users)) {
- hlist_del_rcu(&ctx->list);
- /* Between hlist_del_rcu() and dropping the initial ref */
+ radix_tree_delete(&current->mm->ioctx_rtree, ctx->user_id);
+ /* Between radix_tree_delete() and dropping the initial ref */
synchronize_rcu();
/*
@@ -542,25 +549,38 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
*/
void exit_aio(struct mm_struct *mm)
{
- struct kioctx *ctx;
- struct hlist_node *n;
+ struct kioctx *ctx[16];
+ unsigned long idx = 0;
+ int count;
- hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
- /*
- * We don't need to bother with munmap() here -
- * exit_mmap(mm) is coming and it'll unmap everything.
- * Since aio_free_ring() uses non-zero ->mmap_size
- * as indicator that it needs to unmap the area,
- * just set it to 0; aio_free_ring() is the only
- * place that uses ->mmap_size, so it's safe.
- */
- ctx->mmap_size = 0;
-
- if (percpu_ref_kill(&ctx->users)) {
- hlist_del_rcu(&ctx->list);
- call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+ do {
+ int i;
+
+ count = radix_tree_gang_lookup(&mm->ioctx_rtree, (void **)ctx,
+ idx, sizeof(ctx)/sizeof(void *));
+ for (i = 0; i < count; i++) {
+ void *ret;
+
+ BUG_ON(ctx[i]->user_id < idx);
+ idx = ctx[i]->user_id;
+
+ /*
+ * We don't need to bother with munmap() here -
+ * exit_mmap(mm) is coming and it'll unmap everything.
+ * Since aio_free_ring() uses non-zero ->mmap_size
+ * as indicator that it needs to unmap the area,
+ * just set it to 0; aio_free_ring() is the only
+ * place that uses ->mmap_size, so it's safe.
+ */
+ ctx[i]->mmap_size = 0;
+
+ if (percpu_ref_kill(&ctx[i]->users)) {
+ ret = radix_tree_delete(&mm->ioctx_rtree, idx);
+ BUG_ON(!ret || ret != ctx[i]);
+ call_rcu(&ctx[i]->rcu_head, kill_ioctx_rcu);
+ }
}
- }
+ } while (count);
}
static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -665,12 +685,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
rcu_read_lock();
- hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- if (ctx->user_id == ctx_id) {
- percpu_ref_get(&ctx->users);
- ret = ctx;
- break;
- }
+ ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id);
+ if (ctx) {
+ percpu_ref_get(&ctx->users);
+ ret = ctx;
}
rcu_read_unlock();
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fb425aa..1976fa9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/threads.h>
#include <linux/list.h>
+#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
@@ -383,7 +384,7 @@ struct mm_struct {
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
- struct hlist_head ioctx_list;
+ struct radix_tree_root ioctx_rtree;
#endif
#ifdef CONFIG_MM_OWNER
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 6fcc2e2..62e0d0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -522,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
- INIT_HLIST_HEAD(&mm->ioctx_list);
+ INIT_RADIX_TREE(&mm->ioctx_rtree, GFP_KERNEL);
#endif
}
--
1.7.10.4
On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <[email protected]> wrote:
> When using a large number of threads performing AIO operations, the
> ioctx list can grow to a significant number of entries, making context
> lookup a significant overhead. [...]
>
> This patch converts the ioctx list to a radix tree.
The patch looks nice. One thing we should pay attention to is the
memory consumption. radix-trees can be far less space-efficient than
lists, and as the tree key comes from mmap() it can be pretty sparsely
distributed.
So could you please have a think about this, see if we can cook up some
worst-case numbers and decide if they are problematic?
On Fri, May 10, 2013 at 01:40:15PM -0700, Andrew Morton wrote:
> On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <[email protected]> wrote:
>
> > When using a large number of threads performing AIO operations, the
> > ioctx list can grow to a significant number of entries, making context
> > lookup a significant overhead. [...]
> >
> > This patch converts the ioctx list to a radix tree.
>
> The patch looks nice. One thing we should pay attention to is the
> memory consumption. radix-trees can be far less space-efficient than
> lists, and as the tree key comes from mmap() it can be pretty sparsely
> distributed.
>
> So could you please have a think about this, see if we can cook up some
> worst-case numbers and decide if they are problematic?
Because the overhead of an ioctx is so high (ringbuffer is some number
of pages) it shouldn't matter much - but I wouldn't mind seeing a bit of
arithmetic.
On Sat, May 11, 2013 at 12:15 AM, Kent Overstreet
<[email protected]> wrote:
> On Fri, May 10, 2013 at 01:40:15PM -0700, Andrew Morton wrote:
>> On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <[email protected]> wrote:
>>
>> > When using a large number of threads performing AIO operations, the
>> > ioctx list can grow to a significant number of entries, making context
>> > lookup a significant overhead. [...]
>> >
>> > This patch converts the ioctx list to a radix tree.
>>
>> The patch looks nice. One thing we should pay attention to is the
>> memory consumption. radix-trees can be far less space-efficient than
>> lists, and as the tree key comes from mmap() it can be pretty sparsely
>> distributed.
>>
>> So could you please have a think about this, see if we can cook up some
>> worst-case numbers and decide if they are problematic?
>
> Because the overhead of an ioctx is so high (ringbuffer is some number
> of pages) it shouldn't matter much - but I wouldn't mind seeing a bit of
> arithmetic.
The worst case seems to be when we are running on 64 bits and
CONFIG_BASE_SMALL is enabled. In that case, if the aio_rings are mapped
sparsely enough (4 bits of "space" between mappings, e.g. at 0, 0x400,
0x4000, 0x40000, etc.) then each context needs a full-depth branch of
its own, in this case 13 nodes: (64 - PAGE_SIZE_BITS) / 4.
That adds up to 7384 bytes just for the radix tree, per aio context,
with a radix_tree_node size of 568 bytes. Besides that, the minimum
memory allocated for an aio context seems to be 4928 bytes (one page +
sizeof(struct kioctx)), if I did not miss anything. So in the worst case
the per-context memory consumption grows by roughly 150%, from about
4928 to about 12312 bytes.
However, I am not sure the worst case is that relevant in practice. As
the number of entries grows the overhead shrinks, since some of the
nodes can be shared. Also, when the address space is that pathologically
sparse, shouldn't the radix tree used for memory management suffer from
the same problem?
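
For reference, the back-of-the-envelope numbers above can be reproduced
with a few lines of C (a standalone sketch; the 4-bit fanout, the
568-byte node size and the 4928-byte per-context baseline are the
assumptions quoted above, not measured here):

#include <stdio.h>

int main(void)
{
	const int page_shift = 12;	/* assuming 4K pages (PAGE_SIZE_BITS above) */
	const int map_shift  = 4;	/* RADIX_TREE_MAP_SHIFT with CONFIG_BASE_SMALL */
	const int node_size  = 568;	/* sizeof(struct radix_tree_node), as quoted */
	const int base_cost  = 4928;	/* one page + sizeof(struct kioctx), as quoted */

	int depth = (64 - page_shift) / map_shift;	/* 13 levels */
	int tree_cost = depth * node_size;		/* 7384 bytes */

	printf("worst-case radix tree overhead: %d bytes per context "
	       "(baseline: %d bytes)\n", tree_cost, base_cost);
	return 0;
}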
On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> When using a large number of threads performing AIO operations, the
> ioctx list can grow to a significant number of entries, making context
> lookup a significant overhead. [...]
>
> This patch converts the ioctx list to a radix tree. [...]

So, I was just doing some benchmarking/profiling to get ready to send
out the aio patches I've got for 3.11 - and it looks like your patch is
causing a ~1.5% throughput regression in my testing :/
I'm just benchmarking random 4k reads with fio, with a single job.
Looking at the profile it appears to all be radix_tree_lookup() - that's
more expensive than I'd expect for a tree with one element.
It's a shame we don't have resizable RCU hash tables, that's really what
we want for this. Actually, I think I might know how to make that work
by using cuckoo hashing...
Might also be worth trying a single element cache of the most recently
used ioctx. Anyways, I don't want to nack your patch over this (the
overhead this is fixing can be quite a bit worse) but I'd like to try
and see if we can fix or reduce the regression in the single ioctx case.
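
A single-entry cache of the most recently used ioctx, layered on top of
the radix tree version, could look roughly like this (an untested
sketch, not code from the thread; the mm->ioctx_hint field is
hypothetical and the necessary invalidation in kill_ioctx()/exit_aio()
is omitted):

static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx, *ret = NULL;

	rcu_read_lock();

	/* Try the most recently used context before walking the tree. */
	ctx = rcu_dereference(mm->ioctx_hint);
	if (!ctx || ctx->user_id != ctx_id)
		ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id);

	if (ctx && ctx->user_id == ctx_id) {
		percpu_ref_get(&ctx->users);
		/* Remember the hit; a stale hint is re-validated above. */
		rcu_assign_pointer(mm->ioctx_hint, ctx);
		ret = ctx;
	}

	rcu_read_unlock();
	return ret;
}

Whether that actually recovers the ~1.5% in the single-ioctx case would
need the same fio job to confirm.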
On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote:
> On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> > When using a large number of threads performing AIO operations, the
> > ioctx list can grow to a significant number of entries, making context
> > lookup a significant overhead. [...]
> >
> > This patch converts the ioctx list to a radix tree. [...]
>
> So, I was just doing some benchmarking/profiling to get ready to send
> out the aio patches I've got for 3.11 - and it looks like your patch is
> causing a ~1.5% throughput regression in my testing :/
... <snip>
I've got an alternate approach for fixing this wart in lookup_ioctx()...
Instead of using an rbtree, just use the reserved id in the ring buffer
header to index an array pointing to the ioctx. It's not finished yet,
and it needs to be tidied up, but it's most of the way there.
-ben
--
"Thought is the essence of where you are now."
fs/aio.c | 80 ++++++++++++++++++++++++++++++++++++++++++-----
include/linux/mm_types.h | 5 ++
kernel/fork.c | 4 ++
3 files changed, 81 insertions(+), 8 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 7fe5bde..96665db 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,6 +108,8 @@ struct kioctx {
} ____cacheline_aligned_in_smp;
struct page *internal_pages[AIO_RING_PAGES];
+
+ unsigned id;
};
/*------ sysctl variables----*/
@@ -207,7 +209,7 @@ static int aio_setup_ring(struct kioctx *ctx)
ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
- ring->id = ctx->user_id;
+ ring->id = ~0U;
ring->head = ring->tail = 0;
ring->magic = AIO_RING_MAGIC;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -351,6 +353,7 @@ static void put_ioctx(struct kioctx *ctx)
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
struct mm_struct *mm = current->mm;
+ struct aio_ring *ring;
struct kioctx *ctx;
int err = -ENOMEM;
@@ -394,15 +397,61 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
/* now link into global list. */
spin_lock(&mm->ioctx_lock);
+ if (mm->ioctx_nr == 0) {
+ mm->ioctx_nr++;
+ mm->ioctx_table[0] = ctx;
+ ctx->id = 0;
+ } else {
+ unsigned i;
+ if (mm->ioctx_nr == mm->ioctx_max) {
+ unsigned new_nr;
+ struct kioctx **table;
+again:
+ new_nr = mm->ioctx_max * 4;
+ spin_unlock(&mm->ioctx_lock);
+ table = kcalloc(new_nr, sizeof(*table), GFP_KERNEL);
+ if (!table) {
+ err = -ENOMEM;
+ goto out_cleanup_noerr;
+ }
+ spin_lock(&mm->ioctx_lock);
+ if (mm->ioctx_max < new_nr) {
+ struct kioctx **old = mm->ioctx_table;
+ memcpy(table, mm->ioctx_table,
+ mm->ioctx_max * sizeof(struct kioctx *));
+ smp_wmb();
+ mm->ioctx_table = table;
+ smp_wmb();
+ mm->ioctx_max = new_nr;
+ //FIXME: kfree_rcu(old, old);
+ } else
+ kfree(table);
+ if (mm->ioctx_nr == mm->ioctx_max)
+ goto again;
+ }
+ for (i=0; i<mm->ioctx_max; i++) {
+ if (!mm->ioctx_table[i]) {
+ ctx->id = i;
+ mm->ioctx_table[i] = ctx;
+ break;
+ }
+ }
+ WARN_ON(1);
+ }
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->id = ctx->id;
+ kunmap_atomic(ring);
+
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
out_cleanup:
err = -EAGAIN;
+out_cleanup_noerr:
aio_free_ring(ctx);
out_freectx:
kmem_cache_free(kioctx_cachep, ctx);
@@ -434,7 +483,14 @@ static void kill_ioctx_rcu(struct rcu_head *head)
static void kill_ioctx(struct kioctx *ctx)
{
if (!atomic_xchg(&ctx->dead, 1)) {
+ struct mm_struct *mm = current->mm;
+
+ spin_lock(&mm->ioctx_lock);
+ WARN_ON(ctx != mm->ioctx_table[ctx->id]);
+ mm->ioctx_table[ctx->id] = NULL;
hlist_del_rcu(&ctx->list);
+ spin_unlock(&mm->ioctx_lock);
+
/* Between hlist_del_rcu() and dropping the initial ref */
synchronize_rcu();
@@ -496,6 +552,8 @@ void exit_aio(struct mm_struct *mm)
ctx->mmap_size = 0;
if (!atomic_xchg(&ctx->dead, 1)) {
+ WARN_ON(ctx != mm->ioctx_table[ctx->id]);
+ mm->ioctx_table[ctx->id] = NULL;
hlist_del_rcu(&ctx->list);
call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
}
@@ -557,19 +615,25 @@ EXPORT_SYMBOL(aio_put_req);
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
+ struct aio_ring __user *ring = (void __user *)ctx_id;
struct mm_struct *mm = current->mm;
struct kioctx *ctx, *ret = NULL;
+ unsigned id;
+
+ if (get_user(id, &ring->id))
+ return NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- if (ctx->user_id == ctx_id) {
- atomic_inc(&ctx->users);
- ret = ctx;
- break;
- }
- }
+ if (id >= mm->ioctx_max)
+ goto out;
+ ctx = mm->ioctx_table[id];
+ if (ctx->user_id == ctx_id) {
+ atomic_inc(&ctx->users);
+ ret = ctx;
+ }
+out:
rcu_read_unlock();
return ret;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..6958e55 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -322,6 +322,7 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
+struct kioctx;
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -387,6 +388,10 @@ struct mm_struct {
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct hlist_head ioctx_list;
+ unsigned ioctx_nr;
+ unsigned ioctx_max;
+ struct kioctx **ioctx_table;
+ struct kioctx *ioctx_table_inline[1];
#endif
#ifdef CONFIG_MM_OWNER
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a..5ef351c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,10 @@ static void mm_init_aio(struct mm_struct *mm)
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
INIT_HLIST_HEAD(&mm->ioctx_list);
+ mm->ioctx_nr = 0;
+ mm->ioctx_max = 1;
+ mm->ioctx_table = mm->ioctx_table_inline;
+ mm->ioctx_table_inline[0] = NULL;
#endif
}
> I've got an alternate approach for fixing this wart in lookup_ioctx()...
> Instead of using an rbtree, just use the reserved id in the ring buffer
> header to index an array pointing to the ioctx. It's not finished yet,
> and it needs to be tidied up, but it's most of the way there.
Yeah, that might work.
Note that the patch wasn't using an rbtree; it was storing the pointer
value in a *radix tree*, which is why single lookups took so long.
Presumably the radix tree was used to allow RCU lookups.

Your ring->id trick lets us use RCU with small ints instead of the
context pointer. It might be worth using idr instead of rolling manual
array code. It'd still be much faster than the list, but would get rid
of the large alloc, array walking, memcpy(), etc.
- z
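
To make the idr suggestion concrete, the lookup side could look roughly
like this on top of the ring->id scheme (an illustrative sketch, not
code from the thread; the mm->ioctx_idr field is an assumption):

static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
	struct aio_ring __user *ring = (void __user *)ctx_id;
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx, *ret = NULL;
	unsigned id;

	/* The small integer id lives in the user-visible ring header. */
	if (get_user(id, &ring->id))
		return NULL;

	rcu_read_lock();
	ctx = idr_find(&mm->ioctx_idr, id);	/* RCU-safe lookup by small int */
	if (ctx && ctx->user_id == ctx_id) {
		atomic_inc(&ctx->users);
		ret = ctx;
	}
	rcu_read_unlock();
	return ret;
}

Insertion would be an idr_alloc() under mm->ioctx_lock in ioctx_alloc(),
with the returned id written into ring->id, and kill_ioctx() would do
the matching idr_remove().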
On Wed, Jun 12, 2013 at 10:40 PM, Zach Brown <[email protected]> wrote:
>> I've got an alternate approach for fixing this wart in lookup_ioctx()...
>> [...]
>
> Your ring->id trick lets us use RCU with small ints instead of the
> context pointer. It might be worth using idr instead of rolling manual
> array code. [...]
>
I picked up Ben's patch and incorporated Zach's idea, and the first
results look promising, as expected. I am going to do a full test with
the same workload I've used for the radix tree and come back with the
results and the patch in a day or so.

Thanks everybody!
On Fri, Jun 14, 2013 at 5:20 PM, Octavian Purdila
<[email protected]> wrote:
> [...]
>
> I picked up Ben's patch and incorporated Zach's idea, and the first
> results look promising, as expected. I am going to do a full test with
> the same workload I've used for the radix tree and come back with the
> results and the patch in a day or so.
>
Unfortunately, I still see performance degradation for the one ioctx
case even when using IDR. I am using the same fio benchmark as before.
On Tue, Jun 18, 2013 at 10:05:22PM +0300, Octavian Purdila wrote:
> On Fri, Jun 14, 2013 at 5:20 PM, Octavian Purdila
> <[email protected]> wrote:
> > [...]
>
> Unfortunately, I still see performance degradation for the one ioctx
> case even when using IDR. I am using the same fio benchmark as before.
How much of a regression?
-ben
--
"Thought is the essence of where you are now."
On Tue, Jun 18, 2013 at 10:08 PM, Benjamin LaHaise <[email protected]> wrote:
> On Tue, Jun 18, 2013 at 10:05:22PM +0300, Octavian Purdila wrote:
>> [...]
>>
>> Unfortunately, I still see performance degradation for the one ioctx
>> case even when using IDR. I am using the same fio benchmark as before.
>
> How much of a regression?
>
Hi Ben,

It is even worse than with the radix tree: I've seen between 1% and 4%
degradation. I have tried to profile it, and so far the culprit seems
to be the hint code in idr_find. perf top -e dTLB-load-misses shows:
       │ static inline void *idr_find(struct idr *idr, int id)
       │ {
       │         struct idr_layer *hint = rcu_dereference_raw(idr->hint);
       │
       │         if (hint && (id & ~IDR_MASK) == hint->prefix)
       │                 return rcu_dereference_raw(hint->ary[id & IDR_MASK]);
 49.42 │ d0:   movzbl %r12b,%edx
       │       add    $0x4,%rdx
       │       mov    0x8(%rax,%rdx,8),%rax
I should say that I am using kvm for testing this. Hopefully this is
not a side effect of virtualization. I have attached the patch I am
testing with.
Thanks,
Tavi