2016-11-08 23:06:49

by David Rientjes

[permalink] [raw]
Subject: [patch] mm, slab: faster active and free stats

From: Greg Thelen <[email protected]>

Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
if there are many slab caches and if there are very lengthy per-node
partial and/or free lists.

Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
stats") addressed the per-node full lists which showed a significant
improvement when no objects were freed. This patch has the same
motivation and optimizes the remainder of the use cases where there are
very lengthy partial and free lists.

This patch maintains per-node active_slabs (full and partial) and
free_slabs rather than iterating the lists at runtime when reading
/proc/slabinfo.

[[email protected]: changelog]
Signed-off-by: Greg Thelen <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
mm/slab.c | 117 +++++++++++++++++++++++++-------------------------------------
mm/slab.h | 3 +-
2 files changed, 49 insertions(+), 71 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
+ parent->active_slabs = 0;
+ parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
- parent->num_slabs = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
#if DEBUG
struct kmem_cache_node *n;
- struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1381,20 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);

for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
- unsigned long active_slabs = 0, num_slabs = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full;
+ unsigned long active_objs = 0, free_objs = 0;
+ unsigned long active_slabs, num_slabs;

spin_lock_irqsave(&n->list_lock, flags);
- num_slabs = n->num_slabs;
- list_for_each_entry(page, &n->slabs_partial, lru) {
- active_objs += page->active;
- num_slabs_partial++;
- }
- list_for_each_entry(page, &n->slabs_free, lru)
- num_slabs_free++;
+ active_slabs = n->active_slabs;
+ num_slabs = active_slabs + n->free_slabs;

- free_objects += n->free_objects;
+ active_objs += (num_slabs * cachep->num) - n->free_objects;
+ free_objs += n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);

- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs -
- (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);
-
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs, num_objs,
- free_objects);
+ node, active_slabs, num_slabs, active_objs,
+ num_slabs * cachep->num, free_objs);
}
#endif
}
@@ -2318,7 +2306,7 @@ static int drain_freelist(struct kmem_cache *cache,

page = list_entry(p, struct page, lru);
list_del(&page->lru);
- n->num_slabs--;
+ n->free_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2753,12 +2741,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));

spin_lock(&n->list_lock);
- if (!page->active)
+ if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
- else
+ n->free_slabs++;
+ } else {
fixup_slab_list(cachep, n, page, &list);
+ n->active_slabs++;
+ }

- n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@@ -2884,7 +2874,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,

/* Try to find non-pfmemalloc slab if needed */
static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
- struct page *page, bool pfmemalloc)
+ struct page *page, bool *page_is_free, bool pfmemalloc)
{
if (!page)
return NULL;
@@ -2903,9 +2893,11 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,

/* Move pfmemalloc slab to the end of list to speed up next search */
list_del(&page->lru);
- if (!page->active)
+ if (*page_is_free) {
+ WARN_ON(page->active);
list_add_tail(&page->lru, &n->slabs_free);
- else
+ *page_is_free = false;
+ } else
list_add_tail(&page->lru, &n->slabs_partial);

list_for_each_entry(page, &n->slabs_partial, lru) {
@@ -2913,9 +2905,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
return page;
}

+ n->free_touched = 1;
list_for_each_entry(page, &n->slabs_free, lru) {
- if (!PageSlabPfmemalloc(page))
+ if (!PageSlabPfmemalloc(page)) {
+ *page_is_free = true;
return page;
+ }
}

return NULL;
@@ -2924,17 +2919,26 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
+ bool page_is_free = false;

+ assert_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial,
struct page, lru);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free,
struct page, lru);
+ if (page)
+ page_is_free = true;
}

if (sk_memalloc_socks())
- return get_valid_first_slab(n, page, pfmemalloc);
+ page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
+
+ if (page && page_is_free) {
+ n->active_slabs++;
+ n->free_slabs--;
+ }

return page;
}
@@ -3434,9 +3438,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
STATS_DEC_ACTIVE(cachep);

/* fixup slab chains */
- if (page->active == 0)
+ if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
- else {
+ n->free_slabs++;
+ n->active_slabs--;
+ } else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3450,7 +3456,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,

page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
- n->num_slabs--;
+ n->free_slabs--;
}
}

@@ -4102,43 +4108,21 @@ static void cache_reap(struct work_struct *w)
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- struct page *page;
- unsigned long active_objs;
- unsigned long num_objs;
- unsigned long active_slabs = 0;
- unsigned long num_slabs, free_objects = 0, shared_avail = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full = 0;
- const char *name;
- char *error = NULL;
+ unsigned long active_objs, num_objs, active_slabs;
+ unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long num_slabs_free = 0;
int node;
struct kmem_cache_node *n;

- active_objs = 0;
- num_slabs = 0;
for_each_kmem_cache_node(cachep, node, n) {
-
check_irq_on();
spin_lock_irq(&n->list_lock);

- num_slabs += n->num_slabs;
+ num_slabs += n->active_slabs + n->free_slabs;
+ num_slabs_free += n->free_slabs;

- list_for_each_entry(page, &n->slabs_partial, lru) {
- if (page->active == cachep->num && !error)
- error = "slabs_partial accounting error";
- if (!page->active && !error)
- error = "slabs_partial accounting error";
- active_objs += page->active;
- num_slabs_partial++;
- }
+ free_objs += n->free_objects;

- list_for_each_entry(page, &n->slabs_free, lru) {
- if (page->active && !error)
- error = "slabs_free accounting error";
- num_slabs_free++;
- }
-
- free_objects += n->free_objects;
if (n->shared)
shared_avail += n->shared->avail;

@@ -4146,15 +4130,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
}
num_objs = num_slabs * cachep->num;
active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);

- if (num_objs - active_objs != free_objects && !error)
- error = "free_objects accounting error";
-
- name = cachep->name;
- if (error)
- pr_err("slab: cache %s error: %s\n", name, error);
+ active_objs = num_objs - free_objs;

sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -432,7 +432,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long num_slabs;
+ unsigned long active_slabs; /* length of slabs_partial+slabs_full */
+ unsigned long free_slabs; /* length of slabs_free */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */


2016-11-08 23:17:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Tue, 8 Nov 2016 15:06:45 -0800 (PST) David Rientjes <[email protected]> wrote:

> Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> if there are many slab caches and if there are very lengthy per-node
> partial and/or free lists.
>
> Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> stats") addressed the per-node full lists which showed a significant
> improvement when no objects were freed. This patch has the same
> motivation and optimizes the remainder of the usecases where there are
> very lengthy partial and free lists.
>
> This patch maintains per-node active_slabs (full and partial) and
> free_slabs rather than iterating the lists at runtime when reading
> /proc/slabinfo.

Are there any nice numbers you can share?

2016-11-10 00:38:13

by David Rientjes

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Tue, 8 Nov 2016, Andrew Morton wrote:

> > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > if there are many slab caches and if there are very lengthy per-node
> > partial and/or free lists.
> >
> > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > stats") addressed the per-node full lists which showed a significant
> > improvement when no objects were freed. This patch has the same
> > motivation and optimizes the remainder of the usecases where there are
> > very lengthy partial and free lists.
> >
> > This patch maintains per-node active_slabs (full and partial) and
> > free_slabs rather than iterating the lists at runtime when reading
> > /proc/slabinfo.
>
> Are there any nice numbers you can share?
>

Yes, please add this to the description:


When allocating 100GB of slab from a test cache where every slab page is
on the partial list, reading /proc/slabinfo (includes all other slab
caches on the system) takes ~247ms on average with 48 samples.

As a result of this patch, the same read takes ~0.856ms on average.

2016-11-11 05:51:23

by Joonsoo Kim

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Wed, Nov 09, 2016 at 04:38:08PM -0800, David Rientjes wrote:
> On Tue, 8 Nov 2016, Andrew Morton wrote:
>
> > > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > > if there are many slab caches and if there are very lengthy per-node
> > > partial and/or free lists.
> > >
> > > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > > stats") addressed the per-node full lists which showed a significant
> > > improvement when no objects were freed. This patch has the same
> > > motivation and optimizes the remainder of the usecases where there are
> > > very lengthy partial and free lists.
> > >
> > > This patch maintains per-node active_slabs (full and partial) and
> > > free_slabs rather than iterating the lists at runtime when reading
> > > /proc/slabinfo.
> >
> > Are there any nice numbers you can share?
> >
>
> Yes, please add this to the description:
>
>
> When allocating 100GB of slab from a test cache where every slab page is
> on the partial list, reading /proc/slabinfo (includes all other slab
> caches on the system) takes ~247ms on average with 48 samples.
>
> As a result of this patch, the same read takes ~0.856ms on average.

Hello, David.

Maintaining active/free_slab counters looks quite complex, and I think
that we don't need to maintain these counters for a faster slabinfo.
The key point is to avoid iterating the n->slabs_partial list.

We can calculate the active slabs/objects with the following equations,
as you did in this patch.

active_slab(n) = n->num_slabs - the number of free slabs
active_object(n) = n->num_slabs * cachep->num - n->free_objects

To get the number of free slabs, we need to iterate the n->slabs_free
list, but I guess it would be small enough.

If you don't want to iterate the n->slabs_free list in slabinfo, just
maintaining a count of the slabs on slabs_free would be enough.

Thanks.

2016-11-11 10:30:43

by David Rientjes

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Fri, 11 Nov 2016, Joonsoo Kim wrote:

> Hello, David.
>
> Maintaining acitve/free_slab counters looks so complex. And, I think
> that we don't need to maintain these counters for faster slabinfo.
> Key point is to remove iterating n->slabs_partial list.
>
> We can calculate active slab/object by following equation as you did in
> this patch.
>
> active_slab(n) = n->num_slab - the number of free_slab
> active_object(n) = n->num_slab * cachep->num - n->free_objects
>
> To get the number of free_slab, we need to iterate n->slabs_free list
> but I guess it would be small enough.
>
> If you don't like to iterate n->slabs_free list in slabinfo, just
> maintaining the number of slabs_free would be enough.
>

Hi Joonsoo,

It's a good point, although I don't think the patch has overly complex
logic to keep track of slab state.

We don't prefer to do any iteration in get_slabinfo() since users can
read /proc/slabinfo constantly; it's better to just settle the stats when
slab state changes instead of repeating an expensive operation over and
over if someone is running slabtop(1) or /proc/slabinfo is scraped
regularly for stats.

That said, I imagine there are more clever ways to arrive at the same
answer, and you bring up a good point about maintaining a n->num_slabs and
n->free_slabs rather than n->active_slabs and n->free_slabs.

I don't feel strongly about either approach, but I think some improvement,
such as what this patch provides, is needed to reduce how expensive
simply reading /proc/slabinfo can be.

2016-11-28 07:37:04

by Joonsoo Kim

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Fri, Nov 11, 2016 at 02:30:39AM -0800, David Rientjes wrote:
> On Fri, 11 Nov 2016, Joonsoo Kim wrote:
>
> > Hello, David.
> >
> > Maintaining acitve/free_slab counters looks so complex. And, I think
> > that we don't need to maintain these counters for faster slabinfo.
> > Key point is to remove iterating n->slabs_partial list.
> >
> > We can calculate active slab/object by following equation as you did in
> > this patch.
> >
> > active_slab(n) = n->num_slab - the number of free_slab
> > active_object(n) = n->num_slab * cachep->num - n->free_objects
> >
> > To get the number of free_slab, we need to iterate n->slabs_free list
> > but I guess it would be small enough.
> >
> > If you don't like to iterate n->slabs_free list in slabinfo, just
> > maintaining the number of slabs_free would be enough.
> >
>
> Hi Joonsoo,
>
> It's a good point, although I don't think the patch has overly complex
> logic to keep track of slab state.
>
> We don't prefer to do any iteration in get_slabinfo() since users can
> read /proc/slabinfo constantly; it's better to just settle the stats when
> slab state changes instead of repeating an expensive operation over and
> over if someone is running slabtop(1) or /proc/slabinfo is scraped
> regularly for stats.
>
> That said, I imagine there are more clever ways to arrive at the same
> answer, and you bring up a good point about maintaining a n->num_slabs and
> n->free_slabs rather than n->active_slabs and n->free_slabs.
>
> I don't feel strongly about either approach, but I think some improvement,
> such as what this patch provides, is needed to prevent how expensive
> simply reading /proc/slabinfo can be.

Hello,

Sorry for the long delay.
I agree that this improvement is needed. Could you try the approach
that maintains n->num_slabs and n->free_slabs? I guess that it would be
simpler than this patch and therefore more maintainable.

Thanks.

2016-11-30 00:56:52

by David Rientjes

[permalink] [raw]
Subject: Re: [patch] mm, slab: faster active and free stats

On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
>
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach
> that maintains n->num_slabs and n->free_slabs? I guess that it would be
> simpler than this patch so more maintainable.
>

Ok, what do you think about the following? I'm not sure it's that much
simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs. This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
mm/slab.c | 48 +++++++++++++++++++++---------------------------
mm/slab.h | 4 ++--
2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
- parent->active_slabs = 0;
+ parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);

for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, free_objs = 0;
- unsigned long active_slabs, num_slabs;
+ unsigned long total_slabs, free_slabs, free_objs;

spin_lock_irqsave(&n->list_lock, flags);
- active_slabs = n->active_slabs;
- num_slabs = active_slabs + n->free_slabs;
-
- active_objs += (num_slabs * cachep->num) - n->free_objects;
- free_objs += n->free_objects;
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);

- pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs,
- num_slabs * cachep->num, free_objs);
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
}
#endif
}
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+ n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));

spin_lock(&n->list_lock);
+ n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
- } else {
+ } else
fixup_slab_list(cachep, n, page, &list);
- n->active_slabs++;
- }

STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);

- if (page && page_is_free) {
- n->active_slabs++;
+ if (page && page_is_free)
n->free_slabs--;
- }

return page;
}
@@ -3441,7 +3437,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
n->free_slabs++;
- n->active_slabs--;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
@@ -3457,6 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
+ n->total_slabs--;
}
}

@@ -4109,8 +4105,8 @@ static void cache_reap(struct work_struct *w)
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
unsigned long active_objs, num_objs, active_slabs;
- unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
- unsigned long num_slabs_free = 0;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;

@@ -4118,9 +4114,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
check_irq_on();
spin_lock_irq(&n->list_lock);

- num_slabs += n->active_slabs + n->free_slabs;
- num_slabs_free += n->free_slabs;
-
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
free_objs += n->free_objects;

if (n->shared)
@@ -4128,15 +4123,14 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)

spin_unlock_irq(&n->list_lock);
}
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
-
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
active_objs = num_objs - free_objs;

sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
- sinfo->num_slabs = num_slabs;
+ sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -432,8 +432,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long active_slabs; /* length of slabs_partial+slabs_full */
- unsigned long free_slabs; /* length of slabs_free */
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */

2016-12-02 08:03:05

by Joonsoo Kim

[permalink] [raw]
Subject: RE: [patch] mm, slab: faster active and free stats

Hello, David.

There is a problem with my e-mail client, so I have to use another one.
Please excuse the broken reply style.

Yeah, I like this version much better. Can we account slabs_free directly in get_first_slab()
and get_valid_first_slab()? Passing page_is_free isn't needed if we do it directly in
those functions.

One nitpick: if we don't rename the variable num_slabs to total_slabs, we will
get less churn in the code. However, total_slabs does look better than num_slabs.

Thanks.

-----Original Message-----
From: David Rientjes [mailto:[email protected]]
Sent: Wednesday, November 30, 2016 9:57 AM
To: Joonsoo Kim <[email protected]>
Cc: Andrew Morton <[email protected]>; Greg Thelen <[email protected]>; Aruna Ramakrishna <[email protected]>; Christoph Lameter <[email protected]>; [email protected]; [email protected]
Subject: Re: [patch] mm, slab: faster active and free stats

On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
>
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach
> that maintains n->num_slabs and n->free_slabs? I guess that it would
> be simpler than this patch so more maintainable.
>

Ok, what do you think about the following? I'm not sure it's that much more simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs. This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
mm/slab.c | 48 +++++++++++++++++++++---------------------------
mm/slab.h | 4 ++--
2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
- parent->active_slabs = 0;
+ parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);

for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, free_objs = 0;
- unsigned long active_slabs, num_slabs;
+ unsigned long total_slabs, free_slabs, free_objs;

spin_lock_irqsave(&n->list_lock, flags);
- active_slabs = n->active_slabs;
- num_slabs = active_slabs + n->free_slabs;
-
- active_objs += (num_slabs * cachep->num) - n->free_objects;
- free_objs += n->free_objects;
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);

- pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs,
- num_slabs * cachep->num, free_objs);
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
}
#endif
}
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+ n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));

spin_lock(&n->list_lock);
+ n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
- } else {
+ } else
fixup_slab_list(cachep, n, page, &list);
- n->active_slabs++;
- }

STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);

- if (page && page_is_free) {
- n->active_slabs++;
+ if (page && page_is_free)
n->free_slabs--;
- }

return page;
}
@@ -3441,7 +3437,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
n->free_slabs++;
- n->active_slabs--;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
@@ -3457,6 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
+ n->total_slabs--;
}
}

@@ -4109,8 +4105,8 @@ static void cache_reap(struct work_struct *w)
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
unsigned long active_objs, num_objs, active_slabs;
- unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
- unsigned long num_slabs_free = 0;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;

@@ -4118,9 +4114,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
check_irq_on();
spin_lock_irq(&n->list_lock);

- num_slabs += n->active_slabs + n->free_slabs;
- num_slabs_free += n->free_slabs;
-
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
free_objs += n->free_objects;

if (n->shared)
@@ -4128,15 +4123,14 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)

spin_unlock_irq(&n->list_lock);
}
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
-
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
active_objs = num_objs - free_objs;

sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
- sinfo->num_slabs = num_slabs;
+ sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -432,8 +432,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long active_slabs; /* length of slabs_partial+slabs_full */
- unsigned long free_slabs; /* length of slabs_free */
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */

2016-12-05 04:23:16

by David Rientjes

[permalink] [raw]
Subject: [patch -mm] mm, slab: maintain total slab count instead of active count

Rather than tracking the number of active slabs for each node, track the
total number of slabs. This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

For slab debugging, this also removes an explicit free count since it can
easily be inferred by the difference in number of total objects and number
of active objects.

Suggested-by: Joonsoo Kim <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
For -mm because this depends on
mm-slab-faster-active-and-free-stats.patch

mm/slab.c | 70 ++++++++++++++++++++++++++-------------------------------------
mm/slab.h | 4 ++--
2 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
- parent->active_slabs = 0;
+ parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1376,20 +1376,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);

for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, free_objs = 0;
- unsigned long active_slabs, num_slabs;
+ unsigned long total_slabs, free_slabs, free_objs;

spin_lock_irqsave(&n->list_lock, flags);
- active_slabs = n->active_slabs;
- num_slabs = active_slabs + n->free_slabs;
-
- active_objs += (num_slabs * cachep->num) - n->free_objects;
- free_objs += n->free_objects;
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);

- pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs,
- num_slabs * cachep->num, free_objs);
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
}
#endif
}
@@ -2302,6 +2300,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+ n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2736,13 +2735,12 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));

spin_lock(&n->list_lock);
+ n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
- } else {
+ } else
fixup_slab_list(cachep, n, page, &list);
- n->active_slabs++;
- }

STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2869,7 +2867,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,

/* Try to find non-pfmemalloc slab if needed */
static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
- struct page *page, bool *page_is_free, bool pfmemalloc)
+ struct page *page, bool pfmemalloc)
{
if (!page)
return NULL;
@@ -2888,10 +2886,9 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,

/* Move pfmemalloc slab to the end of list to speed up next search */
list_del(&page->lru);
- if (*page_is_free) {
- WARN_ON(page->active);
+ if (!page->active) {
list_add_tail(&page->lru, &n->slabs_free);
- *page_is_free = false;
+ n->free_slabs++;
} else
list_add_tail(&page->lru, &n->slabs_partial);

@@ -2903,7 +2900,7 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
n->free_touched = 1;
list_for_each_entry(page, &n->slabs_free, lru) {
if (!PageSlabPfmemalloc(page)) {
- *page_is_free = true;
+ n->free_slabs--;
return page;
}
}
@@ -2914,26 +2911,19 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
- bool page_is_free = false;

assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial,
- struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
- page = list_first_entry_or_null(&n->slabs_free,
- struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_free, struct page,
+ lru);
if (page)
- page_is_free = true;
+ n->free_slabs--;
}

if (sk_memalloc_socks())
- page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
-
- if (page && page_is_free) {
- n->active_slabs++;
- n->free_slabs--;
- }
+ page = get_valid_first_slab(n, page, pfmemalloc);

return page;
}
@@ -3436,7 +3426,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
n->free_slabs++;
- n->active_slabs--;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
@@ -3452,6 +3441,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
+ n->total_slabs--;
}
}

@@ -4104,8 +4094,8 @@ static void cache_reap(struct work_struct *w)
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
unsigned long active_objs, num_objs, active_slabs;
- unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
- unsigned long num_slabs_free = 0;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;

@@ -4113,9 +4103,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
check_irq_on();
spin_lock_irq(&n->list_lock);

- num_slabs += n->active_slabs + n->free_slabs;
- num_slabs_free += n->free_slabs;
-
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
free_objs += n->free_objects;

if (n->shared)
@@ -4123,15 +4112,14 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)

spin_unlock_irq(&n->list_lock);
}
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
-
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
active_objs = num_objs - free_objs;

sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
- sinfo->num_slabs = num_slabs;
+ sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -447,8 +447,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long active_slabs; /* length of slabs_partial+slabs_full */
- unsigned long free_slabs; /* length of slabs_free */
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */