2010-04-28 13:12:10

by tip-bot for Jack Steiner

[permalink] [raw]
Subject: [PATCH] - Randomize node rotor used in cpuset_mem_spread_node()


Some workloads that create a large number of small files tend to assign
too many pages to node 0 (multi-node systems). Part of the reason is that
the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
at node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number
of the cpuset.

Signed-off-by: Jack Steiner <[email protected]>


---
arch/x86/mm/numa.c | 17 +++++++++++++++++
include/linux/bitmap.h | 1 +
include/linux/nodemask.h | 5 +++++
kernel/fork.c | 4 ++++
lib/bitmap.c | 19 +++++++++++++++++++
5 files changed, 46 insertions(+)

Index: linux/arch/x86/mm/numa.c
===================================================================
--- linux.orig/arch/x86/mm/numa.c 2010-04-27 16:46:32.272216927 -0500
+++ linux/arch/x86/mm/numa.c 2010-04-28 08:07:09.394972376 -0500
@@ -2,6 +2,7 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <linux/random.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
}
EXPORT_SYMBOL(cpumask_of_node);
#endif
+
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns -1 if nodemask is empty)
+ */
+int __node_random(const nodemask_t *maskp)
+{
+ int w, bit = -1;
+
+ w = nodes_weight(maskp->bits);
+ if (w)
+ bit = bitmap_find_nth_bit(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+EXPORT_SYMBOL(__node_random);
Index: linux/include/linux/bitmap.h
===================================================================
--- linux.orig/include/linux/bitmap.h 2010-04-27 16:46:32.272216927 -0500
+++ linux/include/linux/bitmap.h 2010-04-27 16:46:36.402929137 -0500
@@ -130,6 +130,7 @@ extern int bitmap_find_free_region(unsig
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits);

#define BITMAP_LAST_WORD_MASK(nbits) \
( \
Index: linux/include/linux/nodemask.h
===================================================================
--- linux.orig/include/linux/nodemask.h 2010-04-27 16:46:32.272216927 -0500
+++ linux/include/linux/nodemask.h 2010-04-27 16:46:36.430899232 -0500
@@ -71,6 +71,8 @@
*
* int any_online_node(mask) First online node in mask
*
+ * int node_random(mask) Random node that is set in mask
+ *
* node_set_online(node) set bit 'node' in node_online_map
* node_set_offline(node) clear bit 'node' in node_online_map
*
@@ -264,6 +266,9 @@ static inline int __first_unset_node(con
find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

+#define node_random(mask) __node_random(&(mask))
+extern int __node_random(const nodemask_t *maskp);
+
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES <= BITS_PER_LONG
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c 2010-04-27 16:46:32.276200259 -0500
+++ linux/kernel/fork.c 2010-04-28 08:06:37.106967112 -0500
@@ -1097,6 +1097,10 @@ static struct task_struct *copy_process(
}
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+ p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
Index: linux/lib/bitmap.c
===================================================================
--- linux.orig/lib/bitmap.c 2010-04-27 16:46:32.272216927 -0500
+++ linux/lib/bitmap.c 2010-04-27 16:46:36.570946439 -0500
@@ -1022,3 +1022,22 @@ void bitmap_copy_le(void *dst, const uns
}
}
EXPORT_SYMBOL(bitmap_copy_le);
+
+/**
+ * bitmap_find_nth_bit(buf, ord, bits)
+ * @buf: pointer to bitmap
+ * @n: ordinal bit position (n-th set bit, n >= 0)
+ * @nbits: number of bits in the bitmap
+ *
+ * find the Nth bit that is set in the bitmap
+ * Value of @n should be in range 0 <= @n < weight(buf), else
+ * results are undefined.
+ *
+ * The bit positions 0 through @bits are valid positions in @buf.
+ */
+int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits)
+{
+ return bitmap_ord_to_pos(bitmap, n, bits);
+}
+EXPORT_SYMBOL(bitmap_find_nth_bit);
+


2010-04-28 15:04:35

by tip-bot for Jack Steiner

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

Some workloads that create a large number of small files tend to assign
too many pages to node 0 (multi-node systems). Part of the reason is that
the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
at node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number
of the cpuset.

NOTE: this patch is on TOP of the previous patch:
[PATCH] - New round-robin rotor for SLAB allocations
http://marc.info/?l=linux-mm&m=127231565321947&w=2

Signed-off-by: Jack Steiner <[email protected]>


---

V2 - initial patch was generated against the wrong tree. This patch is based
on linux-next.



arch/x86/mm/numa.c | 17 +++++++++++++++++
include/linux/bitmap.h | 1 +
include/linux/nodemask.h | 5 +++++
kernel/fork.c | 4 ++++
lib/bitmap.c | 19 +++++++++++++++++++
5 files changed, 46 insertions(+)

Index: linux/arch/x86/mm/numa.c
===================================================================
--- linux.orig/arch/x86/mm/numa.c 2010-04-28 09:44:52.422898844 -0500
+++ linux/arch/x86/mm/numa.c 2010-04-28 09:49:39.282899779 -0500
@@ -2,6 +2,7 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <linux/random.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
}
EXPORT_SYMBOL(cpumask_of_node);
#endif
+
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns -1 if nodemask is empty)
+ */
+int __node_random(const nodemask_t *maskp)
+{
+ int w, bit = -1;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_find_nth_bit(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+EXPORT_SYMBOL(__node_random);
Index: linux/include/linux/bitmap.h
===================================================================
--- linux.orig/include/linux/bitmap.h 2010-04-28 09:44:52.494903609 -0500
+++ linux/include/linux/bitmap.h 2010-04-28 09:45:24.952154639 -0500
@@ -141,6 +141,7 @@ extern int bitmap_find_free_region(unsig
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits);

#define BITMAP_LAST_WORD_MASK(nbits) \
( \
Index: linux/include/linux/nodemask.h
===================================================================
--- linux.orig/include/linux/nodemask.h 2010-04-28 09:44:52.494903609 -0500
+++ linux/include/linux/nodemask.h 2010-04-28 09:45:24.984152911 -0500
@@ -66,6 +66,8 @@
* int num_online_nodes() Number of online Nodes
* int num_possible_nodes() Number of all possible Nodes
*
+ * int node_random(mask) Random node with set bit in mask
+ *
* int node_online(node) Is some node online?
* int node_possible(node) Is some node possible?
*
@@ -267,6 +269,9 @@ static inline int __first_unset_node(con
find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

+#define node_random(mask) __node_random(&(mask))
+extern int __node_random(const nodemask_t *maskp);
+
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES <= BITS_PER_LONG
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c 2010-04-28 09:44:52.494903609 -0500
+++ linux/kernel/fork.c 2010-04-28 09:49:39.282899779 -0500
@@ -1079,6 +1079,10 @@ static struct task_struct *copy_process(
}
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+ p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
Index: linux/lib/bitmap.c
===================================================================
--- linux.orig/lib/bitmap.c 2010-04-28 09:44:52.494903609 -0500
+++ linux/lib/bitmap.c 2010-04-28 09:49:39.282899779 -0500
@@ -1098,3 +1098,22 @@ void bitmap_copy_le(void *dst, const uns
}
}
EXPORT_SYMBOL(bitmap_copy_le);
+
+/**
+ * bitmap_find_nth_bit(buf, ord, bits)
+ * @buf: pointer to bitmap
+ * @n: ordinal bit position (n-th set bit, n >= 0)
+ * @nbits: number of bits in the bitmap
+ *
+ * find the Nth bit that is set in the bitmap
+ * Value of @n should be in range 0 <= @n < weight(buf), else
+ * results are undefined.
+ *
+ * The bit positions 0 through @bits are valid positions in @buf.
+ */
+int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits)
+{
+ return bitmap_ord_to_pos(bitmap, n, bits);
+}
+EXPORT_SYMBOL(bitmap_find_nth_bit);
+

2010-04-28 22:40:40

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, 28 Apr 2010 10:04:32 -0500
Jack Steiner <[email protected]> wrote:

> Some workloads that create a large number of small files tend to assign
> too many pages to node 0 (multi-node systems). Part of the reason is that
> the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
> at node 0 for newly created tasks.

And, presumably, your secret testcase forks lots of subprocesses which
do the file creation?

> This patch changes the rotor to be initialized to a random node number
> of the cpuset.

Why random as opposed to, say, inherit-rotor-from-parent?

> Index: linux/arch/x86/mm/numa.c
> ===================================================================
> --- linux.orig/arch/x86/mm/numa.c 2010-04-28 09:44:52.422898844 -0500
> +++ linux/arch/x86/mm/numa.c 2010-04-28 09:49:39.282899779 -0500
> @@ -2,6 +2,7 @@
> #include <linux/topology.h>
> #include <linux/module.h>
> #include <linux/bootmem.h>
> +#include <linux/random.h>
>
> #ifdef CONFIG_DEBUG_PER_CPU_MAPS
> # define DBG(x...) printk(KERN_DEBUG x)
> @@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
> }
> EXPORT_SYMBOL(cpumask_of_node);
> #endif
> +
> +/*
> + * Return the bit number of a random bit set in the nodemask.
> + * (returns -1 if nodemask is empty)
> + */
> +int __node_random(const nodemask_t *maskp)
> +{
> + int w, bit = -1;
> +
> + w = nodes_weight(*maskp);
> + if (w)
> + bit = bitmap_find_nth_bit(maskp->bits,
> + get_random_int() % w, MAX_NUMNODES);
> + return bit;
> +}
> +EXPORT_SYMBOL(__node_random);

I suspect random32() would suffice here. It avoids depleting the
entropy pool altogether.

> +
> +/**
> + * bitmap_find_nth_bit(buf, ord, bits)
> + * @buf: pointer to bitmap
> + * @n: ordinal bit position (n-th set bit, n >= 0)
> + * @nbits: number of bits in the bitmap
> + *
> + * find the Nth bit that is set in the bitmap
> + * Value of @n should be in range 0 <= @n < weight(buf), else
> + * results are undefined.
> + *
> + * The bit positions 0 through @bits are valid positions in @buf.
> + */
> +int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits)
> +{
> + return bitmap_ord_to_pos(bitmap, n, bits);
> +}
> +EXPORT_SYMBOL(bitmap_find_nth_bit);

This does nothing apart from consume more stack? Better to rename
bitmap_ord_to_pos() and export it.

2010-04-28 23:04:13

by Matt Mackall

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, 2010-04-28 at 15:40 -0700, Andrew Morton wrote:
> On Wed, 28 Apr 2010 10:04:32 -0500
> Jack Steiner <[email protected]> wrote:
>
> > Some workloads that create a large number of small files tend to assign
> > too many pages to node 0 (multi-node systems). Part of the reason is that
> > the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
> > at node 0 for newly created tasks.
>
> And, presumably, your secret testcase forks lots of subprocesses which
> do the file creation?
>
> > This patch changes the rotor to be initialized to a random node number
> > of the cpuset.
>
> Why random as opposed to, say, inherit-rotor-from-parent?

That'd be fine, I bet.

> > Index: linux/arch/x86/mm/numa.c
> > ===================================================================
> > --- linux.orig/arch/x86/mm/numa.c 2010-04-28 09:44:52.422898844 -0500
> > +++ linux/arch/x86/mm/numa.c 2010-04-28 09:49:39.282899779 -0500
> > @@ -2,6 +2,7 @@
> > #include <linux/topology.h>
> > #include <linux/module.h>
> > #include <linux/bootmem.h>
> > +#include <linux/random.h>
> >
> > #ifdef CONFIG_DEBUG_PER_CPU_MAPS
> > # define DBG(x...) printk(KERN_DEBUG x)
> > @@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
> > }
> > EXPORT_SYMBOL(cpumask_of_node);
> > #endif
> > +
> > +/*
> > + * Return the bit number of a random bit set in the nodemask.
> > + * (returns -1 if nodemask is empty)
> > + */
> > +int __node_random(const nodemask_t *maskp)
> > +{
> > + int w, bit = -1;
> > +
> > + w = nodes_weight(*maskp);
> > + if (w)
> > + bit = bitmap_find_nth_bit(maskp->bits,
> > + get_random_int() % w, MAX_NUMNODES);
> > + return bit;
> > +}
> > +EXPORT_SYMBOL(__node_random);
>
> I suspect random32() would suffice here. It avoids depleting the
> entropy pool altogether.

I wouldn't worry about that. get_random_int() touches the urandom pool,
which will always leave entropy around. Also, Ted and I decided over a
year ago that we should drop the whole entropy accounting framework,
which I'll get around to some rainy weekend.

--
http://selenic.com : development and support for Mercurial and Linux

2010-04-28 23:12:51

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, 28 Apr 2010 18:04:06 -0500
Matt Mackall <[email protected]> wrote:

> > I suspect random32() would suffice here. It avoids depleting the
> > entropy pool altogether.
>
> I wouldn't worry about that. get_random_int() touches the urandom pool,
> which will always leave entropy around. Also, Ted and I decided over a
> year ago that we should drop the whole entropy accounting framework,
> which I'll get around to some rainy weekend.

hm, so why does random32() exist? Speed?

2010-04-28 23:22:59

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, 28 Apr 2010 16:12:44 -0700
Andrew Morton <[email protected]> wrote:

> On Wed, 28 Apr 2010 18:04:06 -0500
> Matt Mackall <[email protected]> wrote:
>
> > > I suspect random32() would suffice here. It avoids depleting the
> > > entropy pool altogether.
> >
> > I wouldn't worry about that. get_random_int() touches the urandom pool,
> > which will always leave entropy around. Also, Ted and I decided over a
> > year ago that we should drop the whole entropy accounting framework,
> > which I'll get around to some rainy weekend.
>
> hm, so why does random32() exist? Speed?

Because I need a cheap fast pseudo-random source for emulation
and it got used for more and more non-cryptographic uses.
And like most random generators people keep forgetting that
it was not intended for security use.

--

2010-04-28 23:28:11

by Matt Mackall

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, 2010-04-28 at 16:12 -0700, Andrew Morton wrote:
> On Wed, 28 Apr 2010 18:04:06 -0500
> Matt Mackall <[email protected]> wrote:
>
> > > I suspect random32() would suffice here. It avoids depleting the
> > > entropy pool altogether.
> >
> > I wouldn't worry about that. get_random_int() touches the urandom pool,
> > which will always leave entropy around. Also, Ted and I decided over a
> > year ago that we should drop the whole entropy accounting framework,
> > which I'll get around to some rainy weekend.
>
> hm, so why does random32() exist? Speed?

Yep. There are lots of RNG uses that aren't security sensitive and this
is one: the kernel won't be DoSed by an attacker that gets all pages
preferentially allocated on one node. Performance will suffer, but it's
reasonably bounded.

One of my goals is to call these sorts of trade-offs out in the API, ie:

get_fast_random_u32()
get_fast_random_bytes()
get_secure_random_u32()
get_secure_random_bytes()

--
http://selenic.com : development and support for Mercurial and Linux

2010-04-30 16:51:13

by tip-bot for Jack Steiner

[permalink] [raw]
Subject: Re: [PATCH v3] - Randomize node rotor used in cpuset_mem_spread_node()


Some workloads that create a large number of small files tend to assign
too many pages to node 0 (multi-node systems). Part of the reason is that
the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
at node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number
of the cpuset.

NOTE: this patch is on TOP of the previous patch:
[PATCH] - New round-robin rotor for SLAB allocations
http://marc.info/?l=linux-mm&m=127231565321947&w=2

Signed-off-by: Jack Steiner <[email protected]>


---

V2 - initial patch was generated against the wrong tree. This patch is based
on linux-next.

V3 - Eliminate new function bitmap_find_nth_bit(). Rename bitmap_ord_to_pos()
and export it.



arch/x86/mm/numa.c | 17 +++++++++++++++++
include/linux/bitmap.h | 1 +
include/linux/nodemask.h | 5 +++++
kernel/fork.c | 4 ++++
lib/bitmap.c | 2 +-
5 files changed, 28 insertions(+), 1 deletion(-)

Index: linux/arch/x86/mm/numa.c
===================================================================
--- linux.orig/arch/x86/mm/numa.c 2010-04-28 09:59:10.158899192 -0500
+++ linux/arch/x86/mm/numa.c 2010-04-29 13:53:48.531783017 -0500
@@ -2,6 +2,7 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <linux/random.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
}
EXPORT_SYMBOL(cpumask_of_node);
#endif
+
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns -1 if nodemask is empty)
+ */
+int __node_random(const nodemask_t *maskp)
+{
+ int w, bit = -1;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+EXPORT_SYMBOL(__node_random);
Index: linux/include/linux/bitmap.h
===================================================================
--- linux.orig/include/linux/bitmap.h 2010-04-28 09:59:10.158899192 -0500
+++ linux/include/linux/bitmap.h 2010-04-29 13:54:04.639408240 -0500
@@ -141,6 +141,7 @@ extern int bitmap_find_free_region(unsig
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);

#define BITMAP_LAST_WORD_MASK(nbits) \
( \
Index: linux/include/linux/nodemask.h
===================================================================
--- linux.orig/include/linux/nodemask.h 2010-04-28 09:59:10.158899192 -0500
+++ linux/include/linux/nodemask.h 2010-04-28 10:01:42.878971800 -0500
@@ -66,6 +66,8 @@
* int num_online_nodes() Number of online Nodes
* int num_possible_nodes() Number of all possible Nodes
*
+ * int node_random(mask) Random node with set bit in mask
+ *
* int node_online(node) Is some node online?
* int node_possible(node) Is some node possible?
*
@@ -267,6 +269,9 @@ static inline int __first_unset_node(con
find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

+#define node_random(mask) __node_random(&(mask))
+extern int __node_random(const nodemask_t *maskp);
+
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES <= BITS_PER_LONG
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c 2010-04-28 09:59:10.158899192 -0500
+++ linux/kernel/fork.c 2010-04-28 10:03:11.580014823 -0500
@@ -1079,6 +1079,10 @@ static struct task_struct *copy_process(
}
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+ p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
Index: linux/lib/bitmap.c
===================================================================
--- linux.orig/lib/bitmap.c 2010-04-28 09:59:10.158899192 -0500
+++ linux/lib/bitmap.c 2010-04-29 13:53:28.270906295 -0500
@@ -672,7 +672,7 @@ static int bitmap_pos_to_ord(const unsig
*
* The bit positions 0 through @bits are valid positions in @buf.
*/
-static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
{
int pos = 0;

2010-04-30 17:36:14

by Robin Holt

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, Apr 28, 2010 at 03:40:34PM -0700, Andrew Morton wrote:
> On Wed, 28 Apr 2010 10:04:32 -0500
> Jack Steiner <[email protected]> wrote:
>
> > Some workloads that create a large number of small files tend to assign
> > too many pages to node 0 (multi-node systems). Part of the reason is that
> > the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
> > at node 0 for newly created tasks.
>
> And, presumably, your secret testcase forks lots of subprocesses which
> do the file creation?

I think the test case he was using was aim7 or a kernel compile.
Anything that opens a lot of small files will quickly deplete node 0.

> > This patch changes the rotor to be initialized to a random node number
> > of the cpuset.
>
> Why random as opposed to, say, inherit-rotor-from-parent?

If I have something like a find ... -exec grep ..., won't the pages
be biased towards the nodes adjacent to the parent's rotor values.
Maybe I misunderstood Jack's problem, but I believe that was what he
was seeing and why he chose random.

I hope I did not misunderstand Jack's problem and mislead this discussion.

Thanks,
Robin Holt

2010-04-30 18:14:52

by tip-bot for Jack Steiner

[permalink] [raw]
Subject: Re: [PATCH v2] - Randomize node rotor used in cpuset_mem_spread_node()

On Wed, Apr 28, 2010 at 03:40:34PM -0700, Andrew Morton wrote:
> On Wed, 28 Apr 2010 10:04:32 -0500
> Jack Steiner <[email protected]> wrote:
>
> > Some workloads that create a large number of small files tend to assign
> > too many pages to node 0 (multi-node systems). Part of the reason is that
> > the rotor (in cpuset_mem_spread_node()) used to assign nodes starts
> > at node 0 for newly created tasks.
>
> And, presumably, your secret testcase forks lots of subprocesses which
> do the file creation?

We've seen this on several workloads. None were contrived - just standard benchmarks
that use tasks/scripts to create a large number of files. I have not looked
in detail at the tasks/scripts. It seemed desirable to have each new
task start with a random value in the rotor. That would provide
the best randomness.


>
> > This patch changes the rotor to be initialized to a random node number
> > of the cpuset.
>
> Why random as opposed to, say, inherit-rotor-from-parent?

I was concerned that inherit-from-parant might not be effective if the
files were created using a single task that forked child processes to create
the files. Each child would inherit the same rotor value.


>
> > Index: linux/arch/x86/mm/numa.c
> > ===================================================================
> > --- linux.orig/arch/x86/mm/numa.c 2010-04-28 09:44:52.422898844 -0500
> > +++ linux/arch/x86/mm/numa.c 2010-04-28 09:49:39.282899779 -0500
> > @@ -2,6 +2,7 @@
> > #include <linux/topology.h>
> > #include <linux/module.h>
> > #include <linux/bootmem.h>
> > +#include <linux/random.h>
> >
> > #ifdef CONFIG_DEBUG_PER_CPU_MAPS
> > # define DBG(x...) printk(KERN_DEBUG x)
> > @@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(in
> > }
> > EXPORT_SYMBOL(cpumask_of_node);
> > #endif
> > +
> > +/*
> > + * Return the bit number of a random bit set in the nodemask.
> > + * (returns -1 if nodemask is empty)
> > + */
> > +int __node_random(const nodemask_t *maskp)
> > +{
> > + int w, bit = -1;
> > +
> > + w = nodes_weight(*maskp);
> > + if (w)
> > + bit = bitmap_find_nth_bit(maskp->bits,
> > + get_random_int() % w, MAX_NUMNODES);
> > + return bit;
> > +}
> > +EXPORT_SYMBOL(__node_random);
>
> I suspect random32() would suffice here. It avoids depleting the
> entropy pool altogether.
>
> > +
> > +/**
> > + * bitmap_find_nth_bit(buf, ord, bits)
> > + * @buf: pointer to bitmap
> > + * @n: ordinal bit position (n-th set bit, n >= 0)
> > + * @nbits: number of bits in the bitmap
> > + *
> > + * find the Nth bit that is set in the bitmap
> > + * Value of @n should be in range 0 <= @n < weight(buf), else
> > + * results are undefined.
> > + *
> > + * The bit positions 0 through @bits are valid positions in @buf.
> > + */
> > +int bitmap_find_nth_bit(const unsigned long *bitmap, int n, int bits)
> > +{
> > + return bitmap_ord_to_pos(bitmap, n, bits);
> > +}
> > +EXPORT_SYMBOL(bitmap_find_nth_bit);
>
> This does nothing apart from consume more stack? Better to rename
> bitmap_ord_to_pos() and export it.

Agree. Not sure why I did it that way. Fixed in next version of the patch.


--- jack