2006-03-03 23:53:13

by Christoph Lameter

Subject: cache_reap(): Further reduction in interrupt holdoff

cache_reap() takes the l3->list_lock (disabling interrupts) unconditionally and
then does a few checks and maybe some cleanup. This patch makes cache_reap()
take the lock only if there is work to do, and the lock is then taken and
released separately for each cleaning action.

The check for when to do the next reaping is now done without any locking and
becomes racy. This should not matter, since reaping can already be skipped
entirely when the cache_chain_mutex cannot be acquired.

The same is true for the touched processing. If we get this wrong once in a
while, we will mistakenly clean the shared cache, or fail to clean it, which
will have a slight performance impact.

Note that drain_array_locked(), which the drain_array() function introduced
here wraps, will fall out in a subsequent patch, since array cleaning will
then be done in much the same way by all callers.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.16-rc5-mm2/mm/slab.c
===================================================================
--- linux-2.6.16-rc5-mm2.orig/mm/slab.c 2006-03-03 07:50:57.000000000 -0800
+++ linux-2.6.16-rc5-mm2/mm/slab.c 2006-03-03 15:32:49.000000000 -0800
@@ -293,13 +293,13 @@ struct kmem_list3 {
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
- unsigned long next_reap;
- int free_touched;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
};

/*
@@ -3572,6 +3572,22 @@ static void drain_array_locked(struct km
}
}

+
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
+ struct array_cache *ac)
+{
+ if (ac && ac->avail) {
+ spin_lock_irq(&l3->list_lock);
+ drain_array_locked(searchp, ac, 0,
+ numa_node_id());
+ spin_unlock_irq(&l3->list_lock);
+ }
+}
+
/**
* cache_reap - Reclaim memory from caches.
* @unused: unused parameter
@@ -3605,33 +3621,48 @@ static void cache_reap(void *unused)
searchp = list_entry(walk, struct kmem_cache, next);
check_irq_on();

+ /*
+ * We only take the l3 lock if absolutely necessary and we
+ * have established with reasonable certainty that
+ * we can do some work if the lock was obtained.
+ */
l3 = searchp->nodelists[numa_node_id()];
+
reap_alien(searchp, l3);
- spin_lock_irq(&l3->list_lock);

- drain_array_locked(searchp, cpu_cache_get(searchp), 0,
- numa_node_id());
+ drain_array(searchp, l3, cpu_cache_get(searchp));

+ /*
+ * These are racy checks but it does not matter
+ * if we skip one check or scan twice.
+ */
if (time_after(l3->next_reap, jiffies))
- goto next_unlock;
+ goto next;

l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

- if (l3->shared)
- drain_array_locked(searchp, l3->shared, 0,
- numa_node_id());
+ drain_array(searchp, l3, l3->shared);

if (l3->free_touched) {
l3->free_touched = 0;
- goto next_unlock;
+ goto next;
}

tofree = (l3->free_limit + 5 * searchp->num - 1) /
(5 * searchp->num);
do {
+ /*
+ * Do not lock if there are no free blocks.
+ */
+ if (list_empty(&l3->slabs_free))
+ break;
+
+ spin_lock_irq(&l3->list_lock);
p = l3->slabs_free.next;
- if (p == &(l3->slabs_free))
+ if (p == &(l3->slabs_free)) {
+ spin_unlock_irq(&l3->list_lock);
break;
+ }

slabp = list_entry(p, struct slab, list);
BUG_ON(slabp->inuse);
@@ -3646,10 +3677,8 @@ static void cache_reap(void *unused)
l3->free_objects -= searchp->num;
spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
- spin_lock_irq(&l3->list_lock);
} while (--tofree > 0);
-next_unlock:
- spin_unlock_irq(&l3->list_lock);
+next:
cond_resched();
}
check_irq_on();
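
To show the shape of the change outside the diff: the reaper now does its
checks without the lock and takes the lock only around each individual
cleanup step, re-validating once the lock is held. Below is a minimal
user-space sketch of that pattern, with a pthread mutex standing in for the
l3 list_lock; the list type, reap_some() and the one-entry-per-lock-hold
policy are illustrative placeholders, not code from the patch.

#include <pthread.h>
#include <stdlib.h>

struct node {
        struct node *next;
};

struct reap_list {
        pthread_mutex_t lock;
        struct node *head;              /* NULL when there is nothing to reap */
};

static void reap_some(struct reap_list *l, int max)
{
        while (max-- > 0) {
                /* Racy check with no lock held; worst case we skip a pass. */
                if (l->head == NULL)
                        break;

                pthread_mutex_lock(&l->lock);
                struct node *n = l->head;
                if (n == NULL) {        /* re-check now that we hold the lock */
                        pthread_mutex_unlock(&l->lock);
                        break;
                }
                l->head = n->next;      /* detach one entry per lock hold */
                pthread_mutex_unlock(&l->lock);

                free(n);                /* destroy outside the lock, as slab_destroy() is above */
        }
}

The same two-step check appears in the patch around the slabs_free list:
list_empty() is tested without the lock, and the list head is tested again
after the lock has been taken.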


2006-03-04 00:50:49

by Christoph Lameter

Subject: Make drain_array more universal by adding more parameters

Add a 'force' parameter to drain_array() to control whether all objects are
freed, plus a 'node' parameter, and then use drain_array() to replace calls
to drain_array_locked(). Doing so avoids taking the l3 lock in those places
when the arrays are empty.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.16-rc5-mm2/mm/slab.c
===================================================================
--- linux-2.6.16-rc5-mm2.orig/mm/slab.c 2006-03-03 16:14:49.000000000 -0800
+++ linux-2.6.16-rc5-mm2/mm/slab.c 2006-03-03 16:21:48.000000000 -0800
@@ -2128,6 +2128,10 @@ static void check_spinlock_acquired_node
static void drain_array_locked(struct kmem_cache *cachep,
struct array_cache *ac, int force, int node);

+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ struct array_cache *ac,
+ int force, int node);
+
static void do_drain(void *arg)
{
struct kmem_cache *cachep = arg;
@@ -2152,9 +2156,7 @@ static void drain_cpu_caches(struct kmem
for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
- spin_lock_irq(&l3->list_lock);
- drain_array_locked(cachep, l3->shared, 1, node);
- spin_unlock_irq(&l3->list_lock);
+ drain_array(cachep, l3, l3->shared, 1, node);
if (l3->alien)
drain_alien_cache(cachep, l3->alien);
}
@@ -3578,12 +3579,11 @@ static void drain_array_locked(struct km
* necessary.
*/
void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
- struct array_cache *ac)
+ struct array_cache *ac, int force, int node)
{
if (ac && ac->avail) {
spin_lock_irq(&l3->list_lock);
- drain_array_locked(searchp, ac, 0,
- numa_node_id());
+ drain_array_locked(searchp, ac, force, node);
spin_unlock_irq(&l3->list_lock);
}
}
@@ -3604,6 +3604,7 @@ static void cache_reap(void *unused)
{
struct list_head *walk;
struct kmem_list3 *l3;
+ int node = numa_node_id();

if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
@@ -3626,11 +3627,11 @@ static void cache_reap(void *unused)
* have established with reasonable certainty that
* we can do some work if the lock was obtained.
*/
- l3 = searchp->nodelists[numa_node_id()];
+ l3 = searchp->nodelists[node];

reap_alien(searchp, l3);

- drain_array(searchp, l3, cpu_cache_get(searchp));
+ drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);

/*
* These are racy checks but it does not matter
@@ -3641,7 +3642,7 @@ static void cache_reap(void *unused)

l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

- drain_array(searchp, l3, l3->shared);
+ drain_array(searchp, l3, l3->shared, 0, node);

if (l3->free_touched) {
l3->free_touched = 0;

2006-03-04 00:51:41

by Christoph Lameter

Subject: slab: remove drain_array_locked

Remove drain_array_locked() and use the opportunity to further limit the
time the l3 lock is held.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.16-rc5-mm2/mm/slab.c
===================================================================
--- linux-2.6.16-rc5-mm2.orig/mm/slab.c 2006-03-03 16:31:40.000000000 -0800
+++ linux-2.6.16-rc5-mm2/mm/slab.c 2006-03-03 16:43:43.000000000 -0800
@@ -2125,9 +2125,6 @@ static void check_spinlock_acquired_node
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

-static void drain_array_locked(struct kmem_cache *cachep,
- struct array_cache *ac, int force, int node);
-
static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac,
int force, int node);
@@ -3555,40 +3552,33 @@ static void enable_cpucache(struct kmem_
cachep->name, -err);
}

-static void drain_array_locked(struct kmem_cache *cachep,
- struct array_cache *ac, int force, int node)
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ struct array_cache *ac, int force, int node)
{
int tofree;

- check_spinlock_acquired_node(cachep, node);
+ if (!ac || !ac->avail)
+ return;
+
if (ac->touched && !force) {
ac->touched = 0;
} else if (ac->avail) {
tofree = force ? ac->avail : (ac->limit + 4) / 5;
if (tofree > ac->avail)
tofree = (ac->avail + 1) / 2;
+ spin_lock_irq(&l3->list_lock);
free_block(cachep, ac->entry, tofree, node);
+ spin_unlock_irq(&l3->list_lock);
ac->avail -= tofree;
memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void *) * ac->avail);
}
}

-
-/*
- * Drain an array if it contains any elements taking the l3 lock only if
- * necessary.
- */
-void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
- struct array_cache *ac, int force, int node)
-{
- if (ac && ac->avail) {
- spin_lock_irq(&l3->list_lock);
- drain_array_locked(searchp, ac, force, node);
- spin_unlock_irq(&l3->list_lock);
- }
-}
-
/**
* cache_reap - Reclaim memory from caches.
* @unused: unused parameter
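
Putting the hunks above together, the resulting drain logic can be
summarized with the following user-space sketch (a pthread mutex stands in
for the l3 list_lock, and free_entries() is a placeholder, not kernel code):
skip an array that was recently touched unless forced, free roughly a fifth
of the entries otherwise, and hold the lock only around the actual free.

#include <pthread.h>
#include <string.h>

struct cache_array {
        int avail;                      /* number of cached objects */
        int limit;                      /* capacity of entry[] */
        int touched;                    /* set on recent allocation activity */
        void *entry[64];
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_entries(void **entry, int nr)
{
        (void)entry;                    /* placeholder for returning objects */
        (void)nr;                       /* to the free lists */
}

static void drain(struct cache_array *ac, int force)
{
        if (!ac || !ac->avail)
                return;                 /* nothing cached: never take the lock */

        if (ac->touched && !force) {
                ac->touched = 0;        /* recently used: give it another round */
                return;
        }

        int tofree = force ? ac->avail : (ac->limit + 4) / 5;
        if (tofree > ac->avail)
                tofree = (ac->avail + 1) / 2;

        pthread_mutex_lock(&list_lock);
        free_entries(ac->entry, tofree);        /* only the free itself is locked */
        pthread_mutex_unlock(&list_lock);

        ac->avail -= tofree;
        memmove(ac->entry, &ac->entry[tofree], sizeof(void *) * ac->avail);
}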

2006-03-04 02:45:09

by Christoph Lameter

Subject: Do not disable interrupts for off node freeing

Add a free_block_lock() function and remove the node parameter from
free_block(); the node can be determined from slabp itself.

Optimize interrupt holdoff in do_tune_cpucache() and in drain_array():
only disable interrupts if we are freeing to the local node.

[Umm... Maybe we ought to think about that a bit more... Seems to be
okay to me but I am very fallible....]

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.16-rc5-mm2/mm/slab.c
===================================================================
--- linux-2.6.16-rc5-mm2.orig/mm/slab.c 2006-03-03 16:43:43.000000000 -0800
+++ linux-2.6.16-rc5-mm2/mm/slab.c 2006-03-03 18:41:11.000000000 -0800
@@ -701,12 +701,22 @@ static enum {

static DEFINE_PER_CPU(struct work_struct, reap_work);

-static void free_block(struct kmem_cache *cachep, void **objpp, int len,
- int node);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len);
static void enable_cpucache(struct kmem_cache *cachep);
static void cache_reap(void *unused);
static int __node_shrink(struct kmem_cache *cachep, int node);

+/*
+ * Interrupts must be off unless we free to a remote node
+ */
+static void free_block_lock(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ void **objp, int len)
+{
+ spin_lock(&l3->list_lock);
+ free_block(cachep, objp, len);
+ spin_unlock(&l3->list_lock);
+}
+
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
return cachep->array[smp_processor_id()];
@@ -948,10 +958,8 @@ static void __drain_alien_cache(struct k
struct kmem_list3 *rl3 = cachep->nodelists[node];

if (ac->avail) {
- spin_lock(&rl3->list_lock);
- free_block(cachep, ac->entry, ac->avail, node);
- ac->avail = 0;
- spin_unlock(&rl3->list_lock);
+ free_block_lock(cachep, rl3, ac->entry, ac->avail);
+ ac->avail = 0;
}
}

@@ -1135,7 +1143,7 @@ static int __devinit cpuup_callback(stru
/* Free limit for this kmem_list3 */
l3->free_limit -= cachep->batchcount;
if (nc)
- free_block(cachep, nc->entry, nc->avail, node);
+ free_block(cachep, nc->entry, nc->avail);

if (!cpus_empty(mask)) {
spin_unlock_irq(&l3->list_lock);
@@ -1145,7 +1153,7 @@ static int __devinit cpuup_callback(stru
shared = l3->shared;
if (shared) {
free_block(cachep, l3->shared->entry,
- l3->shared->avail, node);
+ l3->shared->avail);
l3->shared = NULL;
}

@@ -2137,9 +2145,8 @@ static void do_drain(void *arg)

check_irq_off();
ac = cpu_cache_get(cachep);
- spin_lock(&cachep->nodelists[node]->list_lock);
- free_block(cachep, ac->entry, ac->avail, node);
- spin_unlock(&cachep->nodelists[node]->list_lock);
+ free_block_lock(cachep, cachep->nodelists[node],
+ ac->entry, ac->avail);
ac->avail = 0;
}

@@ -2945,8 +2952,8 @@ done:
/*
* Caller needs to acquire correct kmem_list's list_lock
*/
-static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
- int node)
+static void free_block(struct kmem_cache *cachep, void **objpp,
+ int nr_objects)
{
int i;
struct kmem_list3 *l3;
@@ -2954,8 +2961,10 @@ static void free_block(struct kmem_cache
for (i = 0; i < nr_objects; i++) {
void *objp = objpp[i];
struct slab *slabp;
+ int node;

slabp = virt_to_slab(objp);
+ node = slabp->nodeid;
l3 = cachep->nodelists[node];
list_del(&slabp->list);
check_spinlock_acquired_node(cachep, node);
@@ -3009,7 +3018,7 @@ static void cache_flusharray(struct kmem
}
}

- free_block(cachep, ac->entry, batchcount, node);
+ free_block(cachep, ac->entry, batchcount);
free_done:
#if STATS
{
@@ -3067,13 +3076,10 @@ static inline void __cache_free(struct k
alien, nodeid);
alien->entry[alien->avail++] = objp;
spin_unlock(&alien->lock);
- } else {
- spin_lock(&(cachep->nodelists[nodeid])->
- list_lock);
- free_block(cachep, &objp, 1, nodeid);
- spin_unlock(&(cachep->nodelists[nodeid])->
- list_lock);
- }
+ } else
+ free_block_lock(cachep,
+ cachep->nodelists[nodeid],
+ &objp, 1);
return;
}
}
@@ -3402,7 +3408,7 @@ static int alloc_kmemlist(struct kmem_ca

nc = cachep->nodelists[node]->shared;
if (nc)
- free_block(cachep, nc->entry, nc->avail, node);
+ free_block(cachep, nc->entry, nc->avail);

l3->shared = new;
if (!cachep->nodelists[node]->alien) {
@@ -3480,11 +3486,25 @@ static int do_tune_cpucache(struct kmem_

for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
+ int node = cpu_to_node(i);
+
if (!ccold)
continue;
- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+
+ /*
+ * Local interrupts must be disabled only for
+ * this node because the list_lock may be obtained
+ * in an interrupt context.
+ */
+ if (node == numa_node_id())
+ local_irq_disable();
+
+ free_block_lock(cachep, cachep->nodelists[node],
+ ccold->entry, ccold->avail);
+
+ if (node == numa_node_id())
+ local_irq_enable();
+
kfree(ccold);
}

@@ -3570,9 +3590,18 @@ void drain_array(struct kmem_cache *cach
tofree = force ? ac->avail : (ac->limit + 4) / 5;
if (tofree > ac->avail)
tofree = (ac->avail + 1) / 2;
- spin_lock_irq(&l3->list_lock);
- free_block(cachep, ac->entry, tofree, node);
- spin_unlock_irq(&l3->list_lock);
+ /*
+ * Freeing to the local node also requires that
+ * interrupts be disabled.
+ */
+ if (node == numa_node_id())
+ local_irq_disable();
+
+ free_block_lock(cachep, l3, ac->entry, tofree);
+
+ if (node == numa_node_id())
+ local_irq_enable();
+
ac->avail -= tofree;
memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void *) * ac->avail);
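
The structural change in the last two hunks, disabling interrupts only
around a local-node free, has no direct user-space equivalent, but its
shape can be sketched with signal blocking standing in for
local_irq_disable()/local_irq_enable(). SIGALRM, free_block_locked() and
the 'local' flag are illustrative placeholders, and whether skipping the
irq-off section for remote nodes is actually safe is exactly the open
question raised in the bracketed note above.

#include <pthread.h>
#include <signal.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_block_locked(void **objpp, int nr)
{
        pthread_mutex_lock(&list_lock);
        (void)objpp;                    /* placeholder: return objects */
        (void)nr;                       /* to the free lists */
        pthread_mutex_unlock(&list_lock);
}

static void drain_to(int local, void **objpp, int nr)
{
        sigset_t block, old;

        sigemptyset(&block);
        sigaddset(&block, SIGALRM);

        /*
         * Mirror the patch's premise: only the local case can race with
         * the "interrupt" (the signal here, the IRQ path in the kernel),
         * so only then do we pay for blocking it around the locked section.
         */
        if (local)
                pthread_sigmask(SIG_BLOCK, &block, &old);

        free_block_locked(objpp, nr);

        if (local)
                pthread_sigmask(SIG_SETMASK, &old, NULL);
}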

2006-03-06 07:49:12

by Pekka Enberg

Subject: Re: cache_reap(): Further reduction in interrupt holdoff

Hi,

(Christoph, please use [email protected] instead.)

On 3/4/06, Christoph Lameter <[email protected]> wrote:
> cache_reap() takes the l3->list_lock (disabling interrupts) unconditionally and
> then does a few checks and maybe some cleanup. This patch makes cache_reap()
> take the lock only if there is work to do, and the lock is then taken and
> released separately for each cleaning action.
>
> The check for when to do the next reaping is now done without any locking
> and becomes racy. This should not matter, since reaping can already be
> skipped entirely when the cache_chain_mutex cannot be acquired.

The cache chain mutex is mostly used by /proc/slabinfo and not taken
that often, I think. But yeah, I don't think next reaping is an issue
either.

On 3/4/06, Christoph Lameter <[email protected]> wrote:
> The same is true for the touched processing. If we get this wrong once in a
> while, we will mistakenly clean the shared cache, or fail to clean it, which
> will have a slight performance impact.

Touched processing worries me a little because it's used when there's
high allocation pressure. Do you have any numbers where this patch
helps?

Pekka

2006-03-06 16:25:30

by Christoph Lameter

Subject: Re: cache_reap(): Further reduction in interrupt holdoff

On Mon, 6 Mar 2006, Pekka Enberg wrote:

> Touched processing worries me little because it's used when there's
> high allocation pressure. Do you have any numbers where this patch
> helps?

We had frequent holdoffs of >30 usec in cache_reap() without this patch.
Now it happens only once in a while.