2009-11-01 16:56:58

by Benjamin LaHaise

Subject: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Use an rbtree in sysfs_dirent to speed up file lookup times

Systems with large numbers (tens of thousands and more) of network
interfaces stress the sysfs code in ways that make the linear search for
a name match take far too long. Avoid this by using an rbtree.

Signed-off-by: Benjamin LaHaise <[email protected]>
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5fad489..30c3fc5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -44,6 +44,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
struct sysfs_dirent **pos;
+ struct rb_node **new, *parent;

BUG_ON(sd->s_sibling);

@@ -57,6 +58,27 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
}
sd->s_sibling = *pos;
*pos = sd;
+
+ // rb tree insert
+ new = &(parent_sd->s_dir.child_rb_root.rb_node);
+ parent = NULL;
+
+ while (*new) {
+ struct sysfs_dirent *this =
+ container_of(*new, struct sysfs_dirent, s_rb_node);
+ int result = strcmp(sd->s_name, this->s_name);
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ rb_link_node(&sd->s_rb_node, parent, new);
+ rb_insert_color(&sd->s_rb_node, &parent_sd->s_dir.child_rb_root);
}

/**
@@ -81,6 +103,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
break;
}
}
+
+ rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}

/**
@@ -331,6 +355,9 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
sd->s_mode = mode;
sd->s_flags = type;

+ if (type == SYSFS_DIR)
+ sd->s_dir.child_rb_root = RB_ROOT;
+
return sd;

err_out2:
@@ -630,11 +657,20 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const unsigned char *name)
{
- struct sysfs_dirent *sd;
-
- for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
- if (!strcmp(sd->s_name, name))
- return sd;
+ struct rb_node *node = parent_sd->s_dir.child_rb_root.rb_node;
+
+ while (node) {
+ struct sysfs_dirent *data =
+ container_of(node, struct sysfs_dirent, s_rb_node);
+ int result;
+ result = strcmp(name, data->s_name);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
return NULL;
}

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7..600109c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
*/

#include <linux/fs.h>
+#include <linux/rbtree.h>

struct sysfs_open_dirent;

@@ -17,6 +18,7 @@ struct sysfs_elem_dir {
struct kobject *kobj;
/* children list starts here and goes through sd->s_sibling */
struct sysfs_dirent *children;
+ struct rb_root child_rb_root;
};

struct sysfs_elem_symlink {
@@ -52,6 +54,7 @@ struct sysfs_dirent {
atomic_t s_active;
struct sysfs_dirent *s_parent;
struct sysfs_dirent *s_sibling;
+ struct rb_node s_rb_node;
const char *s_name;

union {
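
To see the lookup pattern in isolation, the following is a self-contained
plain-C sketch of the same idea: key a directory's children by strcmp() on
the entry name and walk a search tree instead of scanning a sibling list.
A plain unbalanced BST and illustrative names stand in for the kernel's
rbtree API here, so this is only a model of the patch, not kernel code.

#include <stdio.h>
#include <string.h>

struct dirent_node {
	const char *name;
	struct dirent_node *left, *right;
};

static struct dirent_node *dirent_insert(struct dirent_node *root,
					 struct dirent_node *nd)
{
	struct dirent_node **pos = &root;

	while (*pos) {
		int cmp = strcmp(nd->name, (*pos)->name);

		if (cmp == 0)		/* the kernel patch BUG()s on duplicates; */
			return root;	/* this sketch simply refuses the insert */
		pos = cmp < 0 ? &(*pos)->left : &(*pos)->right;
	}
	*pos = nd;
	return root;
}

static struct dirent_node *dirent_find(struct dirent_node *root,
				       const char *name)
{
	while (root) {
		int cmp = strcmp(name, root->name);

		if (cmp == 0)
			return root;
		root = cmp < 0 ? root->left : root->right;
	}
	return NULL;
}

int main(void)
{
	struct dirent_node a = { "eth0" }, b = { "eth1" }, c = { "lo" };
	struct dirent_node *root = NULL;

	root = dirent_insert(root, &a);
	root = dirent_insert(root, &b);
	root = dirent_insert(root, &c);
	printf("%s\n", dirent_find(root, "eth1") ? "found" : "missing");
	return 0;
}

In the patch itself the same walk happens over rb_node links, with
rb_link_node()/rb_insert_color() keeping the tree balanced so lookups stay
O(log n) even for unfavourable name orderings.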


2009-11-01 16:56:47

by Benjamin LaHaise

Subject: [PATCH 2/3] sysfs directory scaling: doubly linked list for dirents

When adding/removing large numbers of entries from sysfs directories, one
hot spot is in the link and unlink operations of sysfs. When linking a
new sysfs_dirent into the tree, the operation can be significantly sped up
by starting at the end of the list rather than the beginning, since inode
numbers are handed out in increasing order and a new entry almost always
belongs at the tail. Unlink is improved by using a doubly linked list.

Signed-off-by: Benjamin LaHaise <[email protected]>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,11 +43,18 @@
static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
- struct sysfs_dirent **pos;
+ struct sysfs_dirent **pos, *prev = NULL;
struct rb_node **new, *parent;

BUG_ON(sd->s_sibling);

+ if (parent_sd->s_dir.children_tail &&
+ parent_sd->s_dir.children_tail->s_ino < sd->s_ino) {
+ prev = parent_sd->s_dir.children_tail;
+ pos = &prev->s_sibling;
+ goto got_it;
+ }
+
/* Store directory entries in order by ino. This allows
* readdir to properly restart without having to add a
* cursor into the s_dir.children list.
@@ -55,8 +62,13 @@
for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
if (sd->s_ino < (*pos)->s_ino)
break;
+ prev = *pos;
}
+got_it:
+ if (prev == parent_sd->s_dir.children_tail)
+ parent_sd->s_dir.children_tail = sd;
sd->s_sibling = *pos;
+ sd->s_sibling_prev = prev;
*pos = sd;

// rb tree insert
@@ -93,17 +105,20 @@
*/
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
{
- struct sysfs_dirent **pos;
-
- for (pos = &sd->s_parent->s_dir.children; *pos;
- pos = &(*pos)->s_sibling) {
- if (*pos == sd) {
- *pos = sd->s_sibling;
- sd->s_sibling = NULL;
- break;
- }
- }
+ struct sysfs_dirent **pos, *prev = NULL;

+ prev = sd->s_sibling_prev;
+ if (prev)
+ pos = &prev->s_sibling;
+ else
+ pos = &sd->s_parent->s_dir.children;
+ if (sd == sd->s_parent->s_dir.children_tail)
+ sd->s_parent->s_dir.children_tail = prev;
+ *pos = sd->s_sibling;
+ if (sd->s_sibling)
+ sd->s_sibling->s_sibling_prev = prev;
+
+ sd->s_sibling_prev = NULL;
rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}

diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -17,7 +17,9 @@
struct sysfs_elem_dir {
struct kobject *kobj;
/* children list starts here and goes through sd->s_sibling */
+
struct sysfs_dirent *children;
+ struct sysfs_dirent *children_tail;
struct rb_root child_rb_root;
};

@@ -54,6 +56,7 @@
atomic_t s_active;
struct sysfs_dirent *s_parent;
struct sysfs_dirent *s_sibling;
+ struct sysfs_dirent *s_sibling_prev;
struct rb_node s_rb_node;
const char *s_name;
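
The list handling above is small enough to model in isolation. The sketch
below (plain userspace C, illustrative names, not the kernel structures)
shows the two properties the patch relies on: entries arriving with
ever-increasing inode numbers append at the tail in O(1), and the back
pointer lets unlink run in O(1) with no list walk.

#include <stdio.h>

struct entry {
	unsigned long ino;
	struct entry *next;
	struct entry *prev;
};

struct dir {
	struct entry *head;
	struct entry *tail;
};

static void link_entry(struct dir *d, struct entry *e)
{
	struct entry **pos = &d->head, *prev = NULL;

	/* Fast path: a new entry normally has the largest ino seen so far. */
	if (d->tail && d->tail->ino < e->ino) {
		prev = d->tail;
		pos = &prev->next;
	} else {
		/* Slow path: ordered insert scanning from the head. */
		while (*pos && (*pos)->ino < e->ino) {
			prev = *pos;
			pos = &(*pos)->next;
		}
	}
	e->next = *pos;
	e->prev = prev;
	if (e->next)
		e->next->prev = e;
	if (prev == d->tail)
		d->tail = e;
	*pos = e;
}

static void unlink_entry(struct dir *d, struct entry *e)
{
	struct entry **pos = e->prev ? &e->prev->next : &d->head;

	if (d->tail == e)
		d->tail = e->prev;
	*pos = e->next;
	if (e->next)
		e->next->prev = e->prev;
	e->next = e->prev = NULL;
}

int main(void)
{
	struct entry a = { 1 }, b = { 2 }, c = { 3 };
	struct dir d = { NULL, NULL };
	struct entry *e;

	/* New entries arrive in increasing ino order, like new sysfs dirents. */
	link_entry(&d, &a);
	link_entry(&d, &b);
	link_entry(&d, &c);
	unlink_entry(&d, &b);
	for (e = d.head; e; e = e->next)
		printf("ino %lu\n", e->ino);
	return 0;
}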

2009-11-01 16:56:45

by Benjamin LaHaise

Subject: [PATCH 3/3] sysfs directory scaling: count number of children dirs

sysfs_count_nlink() is a bottleneck during mass bring-up of network
interfaces. Eliminate this problem by tracking the number of child directories.

Signed-off-by: Benjamin LaHaise <[email protected]>
diff -u b/fs/sysfs/dir.c b/fs/sysfs/dir.c
--- b/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -70,6 +70,7 @@
sd->s_sibling = *pos;
sd->s_sibling_prev = prev;
*pos = sd;
+ parent_sd->s_nr_children_dir += (sysfs_type(sd) == SYSFS_DIR);

// rb tree insert
new = &(parent_sd->s_dir.child_rb_root.rb_node);
@@ -118,6 +119,7 @@
if (sd->s_sibling)
sd->s_sibling->s_sibling_prev = prev;

+ sd->s_parent->s_nr_children_dir -= (sysfs_type(sd) == SYSFS_DIR);
sd->s_sibling_prev = NULL;
rb_erase(&sd->s_rb_node, &sd->s_parent->s_dir.child_rb_root);
}
diff -u b/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
--- b/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -71,6 +71,8 @@
ino_t s_ino;
umode_t s_mode;
struct sysfs_inode_attrs *s_iattr;
+
+ int s_nr_children_dir;
};

#define SD_DEACTIVATED_BIAS INT_MIN
only in patch2:
unchanged:
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -191,14 +191,7 @@ static struct lock_class_key sysfs_inode_imutex_key;

static int sysfs_count_nlink(struct sysfs_dirent *sd)
{
- struct sysfs_dirent *child;
- int nr = 0;
-
- for (child = sd->s_dir.children; child; child = child->s_sibling)
- if (sysfs_type(child) == SYSFS_DIR)
- nr++;
-
- return nr + 2;
+ return sd->s_nr_children_dir + 2;
}

static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
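
The bookkeeping is small enough to show as a standalone model (field and
function names here are illustrative, not the kernel's): bump a per-directory
counter whenever a child that is itself a directory is linked or unlinked,
and derive nlink from it in O(1), with the +2 covering "." and "..".

#include <assert.h>

struct dir_stats {
	int nr_subdirs;		/* children that are themselves directories */
};

static void child_linked(struct dir_stats *d, int child_is_dir)
{
	d->nr_subdirs += !!child_is_dir;
}

static void child_unlinked(struct dir_stats *d, int child_is_dir)
{
	d->nr_subdirs -= !!child_is_dir;
}

static int dir_nlink(const struct dir_stats *d)
{
	return d->nr_subdirs + 2;	/* "." and ".." */
}

int main(void)
{
	struct dir_stats d = { 0 };

	child_linked(&d, 1);	/* a subdirectory */
	child_linked(&d, 0);	/* a plain attribute file */
	assert(dir_nlink(&d) == 3);
	child_unlinked(&d, 1);
	assert(dir_nlink(&d) == 2);
	return 0;
}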

2009-11-03 03:58:17

by Greg KH

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> Use an rbtree in sysfs_dirent to speed up file lookup times
>
> Systems with large numbers (tens of thousands and more) of network
> interfaces stress the sysfs code in ways that make the linear search for
> a name match take far too long. Avoid this by using an rbtree.

What kind of speedups are you seeing here? And do these changes cause a
memory increase due to the structure changes which outweigh the
speedups?

What kind of test are you doing to reproduce this?

thanks,

greg k-h

2009-11-03 06:14:44

by Eric Dumazet

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Greg KH wrote:
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> Use an rbtree in sysfs_dirent to speed up file lookup times
>>
>> Systems with large numbers (tens of thousands and more) of network
>> interfaces stress the sysfs code in ways that make the linear search for
>> a name match take far too long. Avoid this by using an rbtree.
>
> What kind of speedups are you seeing here? And do these changes cause a
> memory increase due to the structure changes which outweigh the
> speedups?
>
> What kind of test are you doing to reproduce this?
>

It's curious because in my tests the biggest problems come from
kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
in the following attempt to create 20,000 devices

(disable hotplug before trying this, and ipv6 too !)
modprobe dummy numdummies=20000

I believe we should address __register_sysctl_paths() scalability
problems too.

I don't know what the 'sentinel' we allocate after each struct ctl_table is,
but I suspect we could reduce the size requirement of the 'sentinel' to include
only the fields needed for the sentinel (and move them to the start of ctl_table)

/*
* For each path component, allocate a 2-element ctl_table array.
* The first array element will be filled with the sysctl entry
* for this, the second will be the sentinel (ctl_name == 0).
*
* We allocate everything in one go so that we don't have to
* worry about freeing additional memory in unregister_sysctl_table.
*/
header = kzalloc(sizeof(struct ctl_table_header) +
(2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);

Then, adding an rb_node in ctl_table_header would speed up __register_sysctl_paths() a bit.

2009-11-03 07:01:29

by Eric Dumazet

Subject: [PATCH] sysctl: reduce ram usage by 40 %

Eric Dumazet wrote:

> It's curious because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> in the following attempt to create 20,000 devices
>
> (disable hotplug before trying this, and ipv6 too !)
> modprobe dummy numdummies=20000
>
> I believe we should address __register_sysctl_paths() scalability
> problems too.
>
> I don't know what the 'sentinel' we allocate after each struct ctl_table is,
> but I suspect we could reduce the size requirement of the 'sentinel' to include
> only the fields needed for the sentinel (and move them to the start of ctl_table)
>

Here is the patch to reduce ram usage of sysctl :

[PATCH] sysctl: reduce ram usage by 40 %

We currently reserve space for a so-called sentinel, a full struct ctl_table,
for each ctl_table. We can cheat a bit since the only fields a sentinel
needs are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
that includes a full ctl_table plus only the required part of a sentinel.

Signed-off-by: Eric Dumazet <[email protected]>
---
include/linux/sysctl.h | 13 ++++++++++++-
kernel/sysctl.c | 19 ++++++++++---------
2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743e..6a1b1d5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1050,8 +1050,10 @@ extern ctl_handler sysctl_ms_jiffies;
/* A sysctl table is an array of struct ctl_table: */
struct ctl_table
{
- int ctl_name; /* Binary ID */
+ /* ctl_name and procname must be first fields (check sentinel) */
+ int ctl_name; /* Binary ID */
const char *procname; /* Text ID for /proc/sys, or zero */
+
void *data;
int maxlen;
mode_t mode;
@@ -1063,6 +1065,15 @@ struct ctl_table
void *extra2;
};

+/* ctl_table_sentinel : a ctl_table followed by a sentinel
+ * (null ctl & procname)
+ */
+struct ctl_table_sentinel {
+ struct ctl_table table;
+ int ctl_name;
+ const char *procname;
+};
+
struct ctl_table_root {
struct list_head root_list;
struct ctl_table_set default_set;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c5..5d29dd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2063,7 +2063,8 @@ struct ctl_table_header *__register_sysctl_paths(
const struct ctl_path *path, struct ctl_table *table)
{
struct ctl_table_header *header;
- struct ctl_table *new, **prevp;
+ struct ctl_table_sentinel *new;
+ struct ctl_table **prevp;
unsigned int n, npath;
struct ctl_table_set *set;

@@ -2080,24 +2081,24 @@ struct ctl_table_header *__register_sysctl_paths(
* worry about freeing additional memory in unregister_sysctl_table.
*/
header = kzalloc(sizeof(struct ctl_table_header) +
- (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+ (npath * sizeof(struct ctl_table_sentinel)), GFP_KERNEL);
if (!header)
return NULL;

- new = (struct ctl_table *) (header + 1);
+ new = (struct ctl_table_sentinel *) (header + 1);

/* Now connect the dots */
prevp = &header->ctl_table;
for (n = 0; n < npath; ++n, ++path) {
/* Copy the procname */
- new->procname = path->procname;
- new->ctl_name = path->ctl_name;
- new->mode = 0555;
+ new->table.procname = path->procname;
+ new->table.ctl_name = path->ctl_name;
+ new->table.mode = 0555;

- *prevp = new;
- prevp = &new->child;
+ *prevp = &new->table;
+ prevp = &new->table.child;

- new += 2;
+ new++;
}
*prevp = table;
header->ctl_table_arg = table;
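
As a rough sanity check of the "40 %" figure, the sketch below compares the
per-path-component cost of the old layout (two full ctl_table entries, one of
them the NULL terminator) with the proposed ctl_table plus two sentinel
fields. The struct below only approximates the 2.6.32-era ctl_table on a
64-bit build; the exact field list and padding depend on kernel version,
architecture and config, so treat the printed numbers as an estimate.

#include <stdio.h>

struct ctl_table_approx {
	int ctl_name;
	const char *procname;
	void *data;
	int maxlen;
	unsigned int mode;		/* stands in for mode_t */
	struct ctl_table_approx *child;
	struct ctl_table_approx *parent;
	void *proc_handler;
	void *strategy;
	void *extra1;
	void *extra2;
};

struct ctl_table_sentinel_approx {
	struct ctl_table_approx table;
	int ctl_name;			/* the only fields a sentinel needs */
	const char *procname;
};

int main(void)
{
	size_t old_per_path = 2 * sizeof(struct ctl_table_approx);
	size_t new_per_path = sizeof(struct ctl_table_sentinel_approx);

	printf("old: %zu bytes per path component\n", old_per_path);
	printf("new: %zu bytes per path component (%.0f%% saved)\n",
	       new_per_path,
	       100.0 * (old_per_path - new_per_path) / old_per_path);
	return 0;
}

On a typical LP64 build this works out to roughly 160 versus 96 bytes per
path component, which is where the ~40% saving comes from.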

2009-11-03 10:23:13

by Eric W. Biederman

Subject: Re: [PATCH] sysctl: reduce ram usage by 40 %

Eric Dumazet <[email protected]> writes:

> Eric Dumazet wrote:
>
>> It's curious because in my tests the biggest problems come from
>> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
>> in the following attempt to create 20,000 devices

I bet that is Al's cute "glue all the sysctl data structures together"
patch. It improves readdir and lookup at a small cost at registration
time.

>> (disable hotplug before trying this, and ipv6 too !)
>> modprobe dummy numdummies=20000


>> I believe we should address __register_sysctl_paths() scalability
>> problems too.

Agreed.

>> I don't know what the 'sentinel' we allocate after each struct ctl_table is,
>> but I suspect we could reduce the size requirement of the 'sentinel' to include
>> only the fields needed for the sentinel (and move them to the start of ctl_table)

The sentinel is just a NULL terminator.

> Here is the patch to reduce ram usage of sysctl :
>
> [PATCH] sysctl: reduce ram usage by 40 %
>
> We currently reserve space for a so-called sentinel, a full struct ctl_table,
> for each ctl_table. We can cheat a bit since the only fields a sentinel
> needs are ctl_name and procname. Add a new structure (struct ctl_table_sentinel)
> that includes a full ctl_table plus only the required part of a sentinel.

Before we address sysctl I would like to get out my patchset that
makes sys_sysctl a wrapper around the ascii version of
/proc/sys/net. Once that goes in it becomes much easier to do things
and perform radical surgery on sysctl. Little things like .ctl_name and
.strategy go away.

Have you happened to look at the other cost of /proc proper? Hmm.
Except for /proc/net/dev_snmp6 it doesn't look like we keep per-interface
directories in proc, so without ipv6 you won't see the proc generic code
at all.

The practical consequence is that if /proc/net/dev_snmp6 is not painful during
registration right now, we can probably convert all of /proc/sys/net to proc
generic after my other changes are in.

Eric

2009-11-03 10:41:39

by Eric W. Biederman

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Benjamin LaHaise <[email protected]> writes:

> Use an rbtree in sysfs_dirent to speed up file lookup times
>
> Systems with large numbers (tens of thousands and more) of network
> interfaces stress the sysfs code in ways that make the linear search for
> a name match take far too long. Avoid this by using an rbtree.

Please take a look at the cleanups_scaling branch at:
kernel.org:/pub/scm/linux/kernel/git/ebiederm/linux-2.6.32-rc5-sysfs-enhancements

I haven't spent a lot of time on it, but it is possible to get everything
except the rbtree without increasing the size of sysfs_dirent. Also, we
don't need both the rbtree and a linked list.

In particular see:
commit 50623bbb82da3bd1d596b9173a91ed1b5aa168b8
Author: Eric W. Biederman <[email protected]>
Date: Sat Oct 31 04:11:18 2009 -0700

sysfs: Sort sysfs directories by name hash.

This is a step in preparation for introducing a more efficient
data structure than a linked list for sysfs entries. By ordering
by name hash instead of by inode sysfs_lookup can be speeded
up as well as allowing restarting after seekdir.

Signed-off-by: Eric W. Biederman <[email protected]>

Meanwhile back to pushing the most important ones for real.

Eric
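
For readers who want the ordering in that commit message spelled out: the
idea reduces to comparing entries by (hash(name), name), with the hash as
the primary key and strcmp() breaking collisions, so directory entries get a
stable total order that does not depend on creation time. The hash and the
names below are purely illustrative, not what the referenced branch uses,
and a real implementation would cache the hash in the dirent rather than
recompute it on every comparison.

#include <stdio.h>
#include <string.h>

static unsigned int name_hash(const char *name)
{
	unsigned int h = 5381;	/* djb2, chosen only for the example */

	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h;
}

static int dirent_cmp(const char *a, const char *b)
{
	unsigned int ha = name_hash(a), hb = name_hash(b);

	if (ha != hb)
		return ha < hb ? -1 : 1;
	return strcmp(a, b);	/* disambiguate hash collisions */
}

int main(void)
{
	printf("eth0 vs eth1: %d\n", dirent_cmp("eth0", "eth1"));
	printf("eth0 vs eth0: %d\n", dirent_cmp("eth0", "eth0"));
	return 0;
}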

2009-11-03 16:10:25

by Greg KH

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tue, Nov 03, 2009 at 07:14:33AM +0100, Eric Dumazet wrote:
> Greg KH wrote:
> > On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> >> Use an rbtree in sysfs_dirent to speed up file lookup times
> >>
> >> Systems with large numbers (tens of thousands and more) of network
> >> interfaces stress the sysfs code in ways that make the linear search for
> >> a name match take far too long. Avoid this by using an rbtree.
> >
> > What kind of speedups are you seeing here? And do these changes cause a
> > memory increase due to the structure changes which outweigh the
> > speedups?
> >
> > What kind of test are you doing to reproduce this?
> >
>
> It's curious because in my tests the biggest problems come from
> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> in the following attempt to create 20,000 devices
>
> (disable hotplug before trying this, and ipv6 too !)
> modprobe dummy numdummies=20000
>
> I believe we should address __register_sysctl_paths() scalability
> problems too.

But registering 20000 devices is a far different problem from using
those 20000 devices :)

I think the "use the device" path should be the one we care the most
about fixing up, as that is much more common than the register path for
all users.

thanks,

greg k-h

2009-11-03 16:41:42

by Octavian Purdila

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tuesday 03 November 2009 18:07:15 you wrote:

> > > What kind of test are you doing to reproduce this?
> >
> > It's curious because in my tests the biggest problems come from
> > kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
> > in the following attempt to create 20,000 devices
> >
> > (disable hotplug before trying this, and ipv6 too !)
> > modprobe dummy numdummies=20000
> >
> > I believe we should address __register_sysctl_paths() scalability
> > problems too.
>
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)
>
> I think the "use the device" path should be the one we care the most
> about fixing up, as that is much more common than the register path for
> all users.
>

For sysctl in general probably, but I would argue that for dynamic network
interfaces (ppp and other sorts of tunnels) the "use" and "register" paths are
not that unbalanced.

For our case, where we use up to 128K interfaces, per-network-interface
sysctl entries are pretty much unusable - but I agree that is not a very
common case :)

However, [1] is not so far-fetched.

[1] http://www.spinics.net/lists/netdev/msg110392.html

Thanks,
tavi

2009-11-03 16:45:47

by Benjamin LaHaise

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)

Registering 20,000 devices *is* a real world problem (I'm actually aiming
for 100,000, as that's what roughly fits in a single 10Gbps link -- something
that a mid range system can now route). When an edge router comes up from
reboot, or after a link has been down, the rate at which customers connect
is important -- too slow, and you get a pile of support calls from customers
complaining that their connection is down. Because of the data structures
used, there isn't even any improvement from an SMP system, so this needs
to be addressed directly.

-ben

2009-11-03 17:56:54

by Greg KH

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tue, Nov 03, 2009 at 11:45:50AM -0500, Benjamin LaHaise wrote:
> On Tue, Nov 03, 2009 at 08:07:15AM -0800, Greg KH wrote:
> > But registering 20000 devices is a far different problem from using
> > those 20000 devices :)
>
> Registering 20,000 devices *is* a real world problem (I'm actually aiming
> for 100,000, as that's what roughly fits in a single 10Gbps link -- something
> that a mid range system can now route). When an edge router comes up from
> reboot, or after a link has been down, the rate at which customers connect
> is important -- too slow, and you get a pile of support calls from customers
> complaining that their connection is down. Because of the data structures
> used, there isn't even any improvement from an SMP system, so this needs
> to be addressed directly.

Ok, how long are we talking about here?

thanks,

greg k-h

2009-11-03 20:02:08

by Benjamin LaHaise

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
> > Use an rbtree in sysfs_dirent to speed up file lookup times
> >
> > Systems with large numbers (tens of thousands and more) of network
> > interfaces stress the sysfs code in ways that make the linear search for
> > a name match take far too long. Avoid this by using an rbtree.
>
> What kind of speedups are you seeing here? And do these changes cause a
> memory increase due to the structure changes which outweigh the
> speedups?

Depends on the number of interfaces being created. Without the patch,
interface creation time tends to double or worse for every 5,000-10,000
additional network interfaces.

> What kind of test are you doing to reproduce this?

I'm creating 30,000+ network interfaces, with the goal being 100,000.
With other hacks in the tree to get around the sysctl and procfs scaling
issues, as well as disabling things like NetworkManager, the results look
as follows:

Interfaces      no-rb   rbtree  rbtree+list
0-5,000         13.8s   14.0s   13.0s
5,000-10,000    20.0s   17.4s   14.4s
10,000-15,000   27.3s   24.1s   16.9s
15,000-20,000   36.3s   32.2s   19.7s
20,000-25,000   45.2s   40.0s   22.9s
25,000-30,000   54.2s   48.2s   26.6s
30,000-35,000   63.9s   54.9s   30.7s

Thoughts?

-ben

2009-11-03 21:32:34

by Eric W. Biederman

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Benjamin LaHaise <[email protected]> writes:

> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>> >
>> > Systems with large numbers (tens of thousands and more) of network
>> > interfaces stress the sysfs code in ways that make the linear search for
>> > a name match take far too long. Avoid this by using an rbtree.
>>
>> What kind of speedups are you seeing here? And do these changes cause a
>> memory increase due to the structure changes which outweigh the
>> speedups?
>
> Depends on the number of interfaces being created. Without the patch,
> interface creation time tends to double or worse for every 5,000-10,000
> additional network interfaces.
>
>> What kind of test are you doing to reproduce this?
>
> I'm creating 30,000+ network interfaces, with the goal being 100,000.
> With other hacks in the tree to get around the sysctl and procfs scaling
> issues, as well as disabling things like NetworkManager, the results look
> as follows:
>
> Interfaces      no-rb   rbtree  rbtree+list
> 0-5,000         13.8s   14.0s   13.0s
> 5,000-10,000    20.0s   17.4s   14.4s
> 10,000-15,000   27.3s   24.1s   16.9s
> 15,000-20,000   36.3s   32.2s   19.7s
> 20,000-25,000   45.2s   40.0s   22.9s
> 25,000-30,000   54.2s   48.2s   26.6s
> 30,000-35,000   63.9s   54.9s   30.7s
>
> Thoughts?

Something is very weird. I just took your no-rb numbers
and divided by the number of interfaces to see how well we are
scaling. I got:

Interfaces   per-interface cost
 5,000       0.002760s
10,000       0.002000s
15,000       0.001820s
20,000       0.001815s
25,000       0.001808s
30,000       0.001807s
35,000       0.001826s

I ran a variant of this test a long time ago, and at that time the
cost per interface grew with additional interfaces added. This
jibes with the fact that the fundamental cost of adding N
network interfaces to sysfs is O(N^2).

Are your numbers from your application and are they real world?
In which case they are interesting, but it would be good if
we could also have microbenchmark numbers that just measure
the sysfs costs. If nothing else I am seeing a big startup
overhead that isn't being subtracted out that makes it hard
to see the real costs here.

Eric

2009-11-03 21:43:44

by Eric W. Biederman

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

[email protected] (Eric W. Biederman) writes:

> Benjamin LaHaise <[email protected]> writes:
>
>> On Mon, Nov 02, 2009 at 07:50:58PM -0800, Greg KH wrote:
>>> On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>>> > Use an rbtree in sysfs_dirent to speed up file lookup times
>>> >
>>> > Systems with large numbers (tens of thousands and more) of network
>>> > interfaces stress the sysfs code in ways that make the linear search for
>>> > a name match take far too long. Avoid this by using an rbtree.
>>>
>>> What kind of speedups are you seeing here? And do these changes cause a
>>> memory increase due to the structure changes which outweigh the
>>> speedups?
>>
>> Depends on the number of interfaces being created. Without the patch,
>> interface creation time tends to double or worse for every 5,000-10,000
>> additional network interfaces.
>>
>>> What kind of test are you doing to reproduce this?
>>
>> I'm creating 30,000+ network interfaces, with the goal being 100,000.
>> With other hacks in the tree to get around the sysctl and procfs scaling
>> issues, as well as disabling things like NetworkManager, the results look
>> as follows:
>>
>> Interfaces      no-rb   rbtree  rbtree+list
>> 0-5,000         13.8s   14.0s   13.0s
>> 5,000-10,000    20.0s   17.4s   14.4s
>> 10,000-15,000   27.3s   24.1s   16.9s
>> 15,000-20,000   36.3s   32.2s   19.7s
>> 20,000-25,000   45.2s   40.0s   22.9s
>> 25,000-30,000   54.2s   48.2s   26.6s
>> 30,000-35,000   63.9s   54.9s   30.7s
>>
>> Thoughts?
>
> Something is very weird. I just took your no-rb numbers
> and divided by the number of interfaces to see how well we are
> scaling. I got:
>
> Interfaces   per-interface cost
>  5,000       0.002760s
> 10,000       0.002000s
> 15,000       0.001820s
> 20,000       0.001815s
> 25,000       0.001808s
> 30,000       0.001807s
> 35,000       0.001826s
>
> I ran a variant of this test a long time ago, and at that time the
> cost per interface grew with additional interfaces added. This
> jibes with the fact that the fundamental cost of adding N
> network interfaces to sysfs is O(N^2).
>
> Are your numbers from your application and are they real world?
> In which case they are interesting, but it would be good if
> we could also have microbenchmark numbers that just measure
> the sysfs costs. If nothing else I am seeing a big startup
> overhead that isn't being subtracted out that makes it hard
> to see the real costs here.

I guess in particular what I would expect is that if we can do 35000
interfaces in 63s with an O(N^2) algorithm, then we should be able to
do 35000 interfaces with an O(N log N) algorithm in under a second,
which for your application should make the time essentially flat in
the number of interfaces.

Until we get close to that I figure we need to do more digging.

Eric

2009-11-03 21:52:48

by Benjamin LaHaise

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tue, Nov 03, 2009 at 01:32:33PM -0800, Eric W. Biederman wrote:
> Are your numbers from your application and are they real world?
> In which case they are interesting, but it would be good if
> we could also have microbenchmark numbers that just measure
> the sysfs costs. If nothing else I am seeing a big startup
> overhead that isn't being subtracted out that makes it hard
> to see the real costs here.

They're application based, so there's a bunch of other overhead included
that won't show up on a microbenchmark. Each interface requires a round
trip between 2 L2TP daemons, so there are lots of syscalls and other cache
polluting effects that won't show up on a microbenchmark. One of the L2TP
daemons is configured not to instantiate any kernel state -- running in
this mode, it has very little overhead.

The other thing to note is that the costs posted are how long it takes to
add an additional 5,000 interfaces in the given range, not the total time
to add say 35,000 interfaces (I didn't feel like waiting that long).

-ben

2009-11-03 21:56:47

by Benjamin LaHaise

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

On Tue, Nov 03, 2009 at 01:43:43PM -0800, Eric W. Biederman wrote:
> I guess in particular what I would expect is that if we can do 35000
> interfaces in 63s with an O(N^2) algorithm, then we should be able to
> do 35000 interfaces with an O(N log N) algorithm in under a second,
> which for your application should make the time essentially flat in
> the number of interfaces.

That's the wrong way to interpret the numbers. The 63s figure at 35,000 is
the time it takes to add 5,000 more interfaces in the 30,000 to
35,000 range. This includes the time required to add a point-to-point IP
route on the interface and bring the interface up.

-ben

2009-11-03 22:18:53

by Eric W. Biederman

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Benjamin LaHaise <[email protected]> writes:

> On Tue, Nov 03, 2009 at 01:32:33PM -0800, Eric W. Biederman wrote:
>> Are your numbers from your application and are they real world?
>> In which case they are interesting, but it would be good if
>> we could also have microbenchmark numbers that just measure
>> the sysfs costs. If nothing else I am seeing a big startup
>> overhead that isn't being subtracted out that makes it hard
>> to see the real costs here.
>
> They're application based, so there's a bunch of other overhead included
> that won't show up on a microbenchmark. Each interface requires a round
> trip between 2 L2TP daemons, so there are lots of syscalls and other cache
> polluting effects that won't show up on a microbenchmark. One of the L2TP
> daemons is configured not to instantiate any kernel state -- running in
> this mode, it has very little overhead.
>
> The other thing to note is that the costs posted are how long it takes to
> add an additional 5,000 interfaces in the given range, not the total time
> to add say 35,000 interfaces (I didn't feel like waiting that long).

Ok. That makes a lot more sense. The times you posted ideally would be flat
but they go up from 12s to 60s.

Eric

2009-11-03 22:28:37

by Eric W. Biederman

Subject: Re: [PATCH 1/3] sysfs directory scaling: rbtree for dirent name lookups

Greg KH <[email protected]> writes:

> On Tue, Nov 03, 2009 at 07:14:33AM +0100, Eric Dumazet wrote:
>> Greg KH wrote:
>> > On Sun, Nov 01, 2009 at 11:31:30AM -0500, Benjamin LaHaise wrote:
>> >> Use an rbtree in sysfs_dirent to speed up file lookup times
>> >>
>> >> Systems with large numbers (tens of thousands and more) of network
>> >> interfaces stress the sysfs code in ways that make the linear search for
>> >> a name match take far too long. Avoid this by using an rbtree.
>> >
>> > What kind of speedups are you seeing here? And do these changes cause a
>> > memory increase due to the structure changes which outweigh the
>> > speedups?
>> >
>> > What kind of test are you doing to reproduce this?
>> >
>>
>> It's curious because in my tests the biggest problems come from
>> kernel/sysctl.c (__register_sysctl_paths) consuming 80% of cpu
>> in the following attempt to create 20,000 devices
>>
>> (disable hotplug before trying this, and ipv6 too !)
>> modprobe dummy numdummies=20000
>>
>> I believe we should address __register_sysctl_paths() scalability
>> problems too.
>
> But registering 20000 devices is a far different problem from using
> those 20000 devices :)
>
> I think the "use the device" path should be the one we care the most
> about fixing up, as that is much more common than the register path for
> all users.

Definitely. Of the three (proc, sysctl and sysfs), sysctl tends to have
the worst costs across the board. They are all rarely used, so a lot
of what gets hit when scaling are rare-path events where even the most
horrible code works fine on small systems.

Usually slow registration times indicate an O(N^2) or worse data
structure for filename lookup.

Eric