The cpuset_zone_allowed() variants are actually only a function of the
zone's node.
Cc: Paul Menage <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Randy Dunlap <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/cpuset.h | 33 ++++++++++++++++++++++-----
kernel/cpuset.c | 59 ++++++++++++++++++++---------------------------
2 files changed, 52 insertions(+), 40 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,6 +12,7 @@
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/cgroup.h>
+#include <linux/mm.h>
#ifdef CONFIG_CPUSETS
@@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
-extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
-extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
+extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
+extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
-static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
return number_of_cpusets <= 1 ||
- __cpuset_zone_allowed_softwall(z, gfp_mask);
+ __cpuset_node_allowed_softwall(node, gfp_mask);
}
-static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
return number_of_cpusets <= 1 ||
- __cpuset_zone_allowed_hardwall(z, gfp_mask);
+ __cpuset_node_allowed_hardwall(node, gfp_mask);
+}
+
+static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+ return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask);
+}
+
+static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+ return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
}
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
return 1;
}
+static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+{
+ return 1;
+}
+
+static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+{
+ return 1;
+}
+
static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
return 1;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2206,26 +2206,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
}
/**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If
- * __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
*
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2262,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
- * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
*/
-
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2306,15 +2301,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
}
/*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate.
- * If __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the
- * TIF_MEMDIE flag, yes. Otherwise, no.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2317,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
* any node on the zonelist except the first. By the time any such
* calls get to this routine, we should just shut up and say 'yes'.
*
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
+ * this variant requires that the node be in the current task's
* mems_allowed or that we're in interrupt. It does not scan up the
* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
* It never sleeps.
*/
-
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
-
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
/*
Slab allocations should respect cpuset hardwall restrictions. Otherwise,
it is possible for tasks in a cpuset to fill slabs allocated on mems
assigned to a disjoint cpuset.
When an allocation is attempted from a cpu slab that resides on a node
that is not allowed by the task's cpuset, an appropriate partial slab or
new slab is allocated instead.
If an allocation is intended for a particular node that the task does not
have access to because of its cpuset, an allowed partial slab is used
instead of failing.
Cc: Christoph Lameter <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
mm/slub.c | 10 ++++++----
1 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1353,6 +1353,8 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
int searchnode = (node == -1) ? numa_node_id() : node;
+ if (!cpuset_node_allowed_hardwall(searchnode, flags))
+ searchnode = cpuset_mem_spread_node();
page = get_partial_node(get_node(s, searchnode));
if (page || (flags & __GFP_THISNODE))
return page;
@@ -1477,13 +1479,13 @@ static void flush_all(struct kmem_cache *s)
* Check if the objects in a per cpu structure fit numa
* locality expectations.
*/
-static inline int node_match(struct kmem_cache_cpu *c, int node)
+static inline int node_match(struct kmem_cache_cpu *c, int node, gfp_t gfpflags)
{
#ifdef CONFIG_NUMA
if (node != -1 && c->node != node)
return 0;
#endif
- return 1;
+ return cpuset_node_allowed_hardwall(c->node, gfpflags);
}
/*
@@ -1517,7 +1519,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
goto new_slab;
slab_lock(c->page);
- if (unlikely(!node_match(c, node)))
+ if (unlikely(!node_match(c, node, gfpflags)))
goto another_slab;
stat(c, ALLOC_REFILL);
@@ -1604,7 +1606,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
objsize = c->objsize;
- if (unlikely(!c->freelist || !node_match(c, node)))
+ if (unlikely(!c->freelist || !node_match(c, node, gfpflags)))
object = __slab_alloc(s, gfpflags, node, addr, c);
On Mon, 2 Mar 2009, David Rientjes wrote:
> The cpuset_zone_allowed() variants are actually only a function of the
> zone's node.
Good observation.
Acked-by: Christoph Lameter <[email protected]>
On Mon, 2 Mar 2009, David Rientjes wrote:
> Slab allocations should respect cpuset hardwall restrictions. Otherwise,
> it is possible for tasks in a cpuset to fill slabs allocated on mems
> assigned to a disjoint cpuset.
Not sure that I understand this correctly. If multiple tasks are running
on the same processor that are part of disjoint cpusets and both tasks are
performing slab allocations without specifying a node then one task could
allocate a page from the first cpuset, take one object from it and then
the second task on the same cpu could consume the rest from a nodeset that
it would otherwise not be allowed to access. On the other hand it is
likely that the second task will also allocate memory from its allowed
nodes that are then consumed by the first task. This is a tradeoff coming
with the pushing of the enforcement of memory policy / cpuset stuff out of
the slab allocator and relying for this on the page allocator.
> If an allocation is intended for a particular node that the task does not
> have access to because of its cpuset, an allowed partial slab is used
> instead of failing.
This would get us back to the slab allocator enforcing memory policies.
> -static inline int node_match(struct kmem_cache_cpu *c, int node)
> +static inline int node_match(struct kmem_cache_cpu *c, int node, gfp_t gfpflags)
> {
> #ifdef CONFIG_NUMA
> if (node != -1 && c->node != node)
> return 0;
> #endif
> - return 1;
> + return cpuset_node_allowed_hardwall(c->node, gfpflags);
> }
This is a hotpath function and doing an expensive function call here would
significantly impact performance.
It also will cause a reloading of the per cpu slab after each task switch
in the scenario discussed above.
The solution that SLAB has for this scenario is to simply not use the
fastpath for off-node allocations. This means that all allocations that
are not on the current node always go through the slow path.
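For reference, here is a condensed sketch of that SLAB behavior, loosely
based on __do_cache_alloc() in mm/slab.c of this era (fallback handling
elided, so treat it as illustrative rather than verbatim): tasks constrained
by a mempolicy or by cpuset slab spreading skip the per-cpu array cache
entirely and take the policy-aware path.
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp = NULL;
	/* policy/cpuset-constrained tasks bypass the per-cpu fastpath */
	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
		objp = alternate_node_alloc(cachep, flags);
		if (objp)
			return objp;
	}
	/* everyone else allocates from the per-cpu array cache */
	return ____cache_alloc(cachep, flags);
}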
On Tue, 3 Mar 2009, Christoph Lameter wrote:
> > Slab allocations should respect cpuset hardwall restrictions. Otherwise,
> > it is possible for tasks in a cpuset to fill slabs allocated on mems
> > assigned to a disjoint cpuset.
>
> Not sure that I understand this correctly. If multiple tasks are running
> on the same processor that are part of disjoint cpusets and both tasks are
> performing slab allocations without specifying a node then one task could
> allocate a page from the first cpuset, take one object from it and then
> the second task on the same cpu could consume the rest from a nodeset that
> it would otherwise not be allowed to access. On the other hand it is
> likely that the second task will also allocate memory from its allowed
> nodes that are then consumed by the first task. This is a tradeoff coming
> with the pushing of the enforcement of memory policy / cpuset stuff out of
> the slab allocator and relying for this on the page allocator.
>
Yes, I agree that it's a significant optimization to allow the cpu slab to
be used by tasks that are not allowed, either because of their mempolicy or
cpuset restrictions, to access the node on which it was allocated. That's
especially true for small object sizes or short-lived allocations where the
hardwall infringement is acceptable for the speed-up.
Unfortunately, it also leads to a violation of the user-imposed restriction
on acceptable memory usage. One of the important aspects of cpusets is to
allow memory isolation from sibling cpusets. It should be possible to kill
all tasks in a cpuset, for example, and expect its partial lists to be
emptied rather than left heavily fragmented by long-lived allocations that
prevent any partial slabs from being freed, which is what can happen when
heavy slab users are allowed to allocate objects anywhere.
> > If an allocation is intended for a particular node that the task does not
> > have access to because of its cpuset, an allowed partial slab is used
> > instead of failing.
>
> This would get us back to the slab allocator enforcing memory policies.
>
Is that a problem? get_any_partial() already enforces cpuset-aware memory
policies when defragmenting remote partial slabs.
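For context, an abridged sketch of get_any_partial() as it looks in
mm/slub.c around this time (the remote_node_defrag_ratio throttling is
elided; illustrative, not verbatim):
static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(flags);
	struct page *page;
	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
		struct kmem_cache_node *n = get_node(s, zone_to_nid(zone));
		/* the hardwall check keeps remote defragmentation cpuset-aware */
		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
				n->nr_partial > s->min_partial) {
			page = get_partial_node(n);
			if (page)
				return page;
		}
	}
#endif
	return NULL;
}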
> > -static inline int node_match(struct kmem_cache_cpu *c, int node)
> > +static inline int node_match(struct kmem_cache_cpu *c, int node, gfp_t gfpflags)
> > {
> > #ifdef CONFIG_NUMA
> > if (node != -1 && c->node != node)
> > return 0;
> > #endif
> > - return 1;
> > + return cpuset_node_allowed_hardwall(c->node, gfpflags);
> > }
>
> This is a hotpath function and doing an expensive function call here would
> significantly impact performance.
>
It's not expensive. It's a no-op for !CONFIG_CPUSETS configs and only a
global variable read for machines running with a single cpuset. When the
machine has multiple cpusets, it indicates that memory restrictions are in
place so checking current->mems_allowed is required and its performance
impact should be assumed.
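Concretely, this is the wrapper added by the cpuset patch earlier in the
thread; with zero or one cpusets it short-circuits on a single read of
number_of_cpusets and never calls into kernel/cpuset.c (and the
!CONFIG_CPUSETS stub simply returns 1):
static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
	/* one global read; the call is skipped on single-cpuset systems */
	return number_of_cpusets <= 1 ||
		__cpuset_node_allowed_hardwall(node, gfp_mask);
}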
> It also will cause a reloading of the per cpu slab after each task switch
> in the scenario discussed above.
>
There is no alternative solution to prevent egregious amounts of slab
from being allocated in a disjoint cpuset that is supposedly mem_exclusive.
On Tue, 3 Mar 2009, David Rientjes wrote:
> There is no alternative solution to prevent egregious amounts of slab
> from being allocated in a disjoint cpuset that is supposedly mem_exclusive.
The amount of memory is limited by the size of a slab page. If the process
goes beyond that amount then the page allocator will come in and enforce
the boundaries.
On Tue, 3 Mar 2009, Christoph Lameter wrote:
> > There is no alternative solution to prevent egregious amounts of slab
> > from being allocated in a disjoint cpuset that is supposedly mem_exclusive.
>
> The amount of memory is limited by the size of a slab page. If the process
> goes beyond that amount then the page allocator will come in and enforce
> the boundaries.
>
Not until all of the partial slabs on the local node are full, which could
be significant given s->min_partial.
This change doesn't affect the fastpath in any significant way for systems
that have not configured multiple cpusets; systems that have configured
more than one cpuset may have specific NUMA locality requirements that
slub is dismissing without this patch.
On Tue, 3 Mar 2009, David Rientjes wrote:
> This change doesn't affect the fastpath in any significant way for systems
> that have not configured multiple cpusets; systems that have configured
> more than one cpuset may have specific NUMA locality requirements that
> slub is dismissing without this patch.
SLUB is also "dismissing" lots of other NUMA locality requirements since
it relies on the page allocator for this. SLUB does *not* implement memory
policy and/or cpuset support for individual objects. NUMA locality is
implemented only (aside from explicit requests of memory from a
certain node) when slab page allocations are performed.
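For reference, that page-level placement happens roughly here, based on
alloc_slab_page() in mm/slub.c of this era (simplified: the
kmem_cache_order_objects argument is reduced to a plain order):
static struct page *alloc_slab_page(gfp_t flags, int node, unsigned int order)
{
	if (node == -1)
		/* no explicit node: the page allocator applies mempolicy/cpusets */
		return alloc_pages(flags, order);
	else
		return alloc_pages_node(node, flags, order);
}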
On Tue, 3 Mar 2009, Christoph Lameter wrote:
> SLUB is also "dismissing" lots of other NUMA locality requirements since
> it relies on the page allocator for this. SLUB does *not* implement memory
> policy and/or cpuset support for individual objects.
Right, and this patch adds that.
> NUMA locality is
> implemented only (aside from explicit requests of memory from a
> certain node) when slab page allocations are performed.
>
Yes, that is the current implementation.
For systems that use multiple cpusets, this allows objects for a task to
be allocated on its assigned cpuset node(s); the only reasonable use case
would be for memory isolation and/or NUMA optimizations.
Unfortunately, we can't add a slab hardwall flag to the cpuset to
configure this behavior since that would require locking to dereference
in the fastpath.
On Tue, Mar 3, 2009 at 1:35 PM, David Rientjes <[email protected]> wrote:
>
> Unfortunately, we can't add a slab hardwall flag to the cpuset to
> configure this behavior since that would require locking to dereference
> in the fastpath.
>
I don't think that it would - cgroups and subsystems should be RCU safe.
Paul
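A minimal sketch of what Paul is suggesting, with an is_slab_hardwall()
flag test modeled on the existing is_mem_exclusive()-style helpers in
kernel/cpuset.c; the flag and the helper are hypothetical and do not exist
today:
static int current_cpuset_slab_hardwall(void)
{
	int hardwall;
	rcu_read_lock();
	/* task_cs() follows the task's cgroup pointers; RCU keeps them valid */
	hardwall = is_slab_hardwall(task_cs(current));	/* hypothetical flag test */
	rcu_read_unlock();
	return hardwall;
}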
On Tue, 3 Mar 2009, Paul Menage wrote:
> > Unfortunately, we can't add a slab hardwall flag to the cpuset to
> > configure this behavior since that would require locking to dereference
> > in the fastpath.
> >
>
> I don't think that it would - cgroups and subsystems should be RCU safe.
>
That would help for cpusets that are looking for NUMA optimizations (i.e.
probably long-lived objects with local affinity), but it would not ensure
memory isolation: tasks in other exclusive cpusets could still allocate
from my slab.
So to address both NUMA and memory isolation, it seems like we'd need to
add a `slab_hardwall' flag that would have to be disabled for both cpusets
(the one hosting the cpu slab and the one allocating an object) to ignore
the hardwall requirement.
That isn't a very clean solution, but it is certainly plausible if
Christoph's objection is that, in the vast majority of systems with
multiple cpusets, allocating from the cpu slab is worth more than true
memory isolation or the ability to allocate an object on a specific node
for affinity (for which we currently have no solution).
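To make that concrete, a rough sketch of how node_match() from the patch
might consult such a flag, simplified to checking only the allocating
task's cpuset; cpuset_slab_hardwall() is an invented name and this is a
discussion aid, not a tested implementation:
static inline int node_match(struct kmem_cache_cpu *c, int node, gfp_t gfpflags)
{
#ifdef CONFIG_NUMA
	if (node != -1 && c->node != node)
		return 0;
#endif
	/* hypothetical: cpusets that clear slab_hardwall skip the check */
	if (!cpuset_slab_hardwall())
		return 1;
	return cpuset_node_allowed_hardwall(c->node, gfpflags);
}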
On Tue, Mar 3, 2009 at 2:29 PM, David Rientjes <[email protected]> wrote:
>
> That would help for cpusets that are looking for NUMA optimizations (i.e.
> probably long-lived objects with local affinity), but it would not ensure
> memory isolation: tasks in other exclusive cpusets could still allocate
> from my slab.
That would be the sysadmin's choice, if they set these other cpusets
with slab_hardwall=false.
Presumably in most cases all cpusets would have slab_hardwall set to
the same value.
Paul
On Tue, 3 Mar 2009, Paul Menage wrote:
> > That would help for cpusets that are looking for NUMA optimizations (i.e.
> > probably long-lived objects with local affinity), but it would not ensure
> > memory isolation: tasks in other exclusive cpusets could still allocate
> > from my slab.
>
> That would be the sysadmin's choice, if they set these other cpusets
> with slab_hardwall=false.
>
> Presumably in most cases all cpusets would have slab_hardwall set to
> the same value.
>
True, and this would have to be clearly documented in
Documentation/cgroups/cpusets.txt.
Christoph, would a `slab_hardwall' cpuset setting address your concerns?
On Tue, 3 Mar 2009, David Rientjes wrote:
> > Presumably in most cases all cpusets would have slab_hardwall set to
> > the same value.
>
> Christoph, would a `slab_hardwall' cpuset setting address your concerns?
That would make the per-object memory policies in SLUB configurable? If
you can do that without regression and it's clean then it would be
acceptable.
Again, if you want per-object memory policies in SLUB then they need to be
added consistently. You would also, f.e., have to check for an MPOL_BIND
condition where you check for cpuset nodes, and make sure that __slab_alloc
goes round robin on MPOL_INTERLEAVE, etc. You end up with a similar
nightmare implementation of that stuff as in SLAB. And as far as I know
that still has its issues, since f.e. the MPOL_INTERLEAVE handling for
objects interferes with the MPOL_INTERLEAVE node for pages, which may
result in strange sequences of page placement on nodes because of
intermediate allocations from slabs.
Memory policies and cpusets were initially designed to deal with page
allocations, not with allocations of small objects. If you read the numactl
manpage then it becomes quite clear that we are dealing with page chunks
(look at the --touch or --strict options, etc).
The intent is to spread memory in page chunks over NUMA nodes. That is
satisfied if the page allocations of the slab allocator are controllable
by memory policies and cpusets. And yes, the page allocations may only
roughly correlate with the tasks that are consuming objects from shared
pools.
On Wed, 4 Mar 2009, Christoph Lameter wrote:
> That would make the per-object memory policies in SLUB configurable? If
> you can do that without regression and it's clean then it would be
> acceptable.
>
Yes, if `slab_hardwall' is enabled for a cpuset it will attempt to replace
the cpu slab with a partial slab on an allowed node.
> Again, if you want per-object memory policies in SLUB then they need to be
> added consistently. You would also, f.e., have to check for an MPOL_BIND
> condition where you check for cpuset nodes, and make sure that __slab_alloc
> goes round robin on MPOL_INTERLEAVE, etc. You end up with a similar
> nightmare implementation of that stuff as in SLAB.
I agree that we should avoid adding mempolicy support, especially
MPOL_INTERLEAVE at the object level since that would totally avoid the
fastpath for any task using it.