This patch triggers slab defragmentation from memory reclaim.

The logical point for this is after slab shrinking has been performed in
vmscan.c. At that point the fragmentation ratio of the slab caches has
increased because objects were freed via the LRUs. So we call
kmem_cache_defrag() from there.

shrink_slab() in vmscan.c is called in some contexts to do global
shrinking of slabs and in others to do shrinking for a particular zone.
Pass the zone to shrink_slab() so that it can call kmem_cache_defrag()
and restrict the defragmentation to the node that is under memory
pressure.

Reviewed-by: Rik van Riel <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
fs/drop_caches.c | 2 +-
include/linux/mm.h | 2 +-
include/linux/slab.h | 1 +
mm/vmscan.c | 26 +++++++++++++++++++-------
4 files changed, 22 insertions(+), 9 deletions(-)
Index: linux-2.6/fs/drop_caches.c
===================================================================
--- linux-2.6.orig/fs/drop_caches.c 2007-08-29 19:30:53.000000000 -0700
+++ linux-2.6/fs/drop_caches.c 2007-11-06 12:53:40.000000000 -0800
@@ -50,7 +50,7 @@ void drop_slab(void)
int nr_objects;
do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+ nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
} while (nr_objects > 10);
}
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2007-11-06 12:33:55.000000000 -0800
+++ linux-2.6/include/linux/mm.h 2007-11-06 12:54:11.000000000 -0800
@@ -1118,7 +1118,7 @@ int in_gate_area_no_task(unsigned long a
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages);
+ unsigned long lru_pages, struct zone *z);
void drop_pagecache(void);
void drop_slab(void);
Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h 2007-11-06 12:37:51.000000000 -0800
+++ linux-2.6/include/linux/slab.h 2007-11-06 12:53:40.000000000 -0800
@@ -63,6 +63,7 @@ void kmem_cache_free(struct kmem_cache *
unsigned int kmem_cache_size(struct kmem_cache *);
const char *kmem_cache_name(struct kmem_cache *);
int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+int kmem_cache_defrag(int node);
/*
* Please use this macro to create slab caches. Simply specify the
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2007-10-25 18:28:41.000000000 -0700
+++ linux-2.6/mm/vmscan.c 2007-11-06 12:55:25.000000000 -0800
@@ -150,10 +150,18 @@ EXPORT_SYMBOL(unregister_shrinker);
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure, but the VFS infrastructure does not allow that at the present
+ * time.
+ *
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+ unsigned long lru_pages, struct zone *zone)
{
struct shrinker *shrinker;
unsigned long ret = 0;
@@ -210,6 +218,8 @@ unsigned long shrink_slab(unsigned long
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+ if (gfp_mask & __GFP_FS)
+ kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
return ret;
}
@@ -1241,7 +1251,7 @@ unsigned long try_to_free_pages(struct z
if (!priority)
disable_swap_token();
nr_reclaimed += shrink_zones(priority, zones, &sc);
- shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+ shrink_slab(sc.nr_scanned, gfp_mask, lru_pages, NULL);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1419,7 +1429,7 @@ loop_again:
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ lru_pages, zone);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
@@ -1658,7 +1668,7 @@ unsigned long shrink_all_memory(unsigned
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
if (!reclaim_state.reclaimed_slab)
break;
@@ -1696,7 +1706,7 @@ unsigned long shrink_all_memory(unsigned
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- count_lru_pages());
+ count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -1713,7 +1723,8 @@ unsigned long shrink_all_memory(unsigned
if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask,
+ count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
}
@@ -1875,7 +1886,8 @@ static int __zone_reclaim(struct zone *z
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+ zone) &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
slab_reclaimable - nr_pages)
;
--
Hi Christoph,
On Tue, Nov 06, 2007 at 05:11:42PM -0800, Christoph Lameter wrote:
> Index: linux-2.6/include/linux/slab.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slab.h 2007-11-06 12:37:51.000000000 -0800
> +++ linux-2.6/include/linux/slab.h 2007-11-06 12:53:40.000000000 -0800
> @@ -63,6 +63,7 @@ void kmem_cache_free(struct kmem_cache *
> unsigned int kmem_cache_size(struct kmem_cache *);
> const char *kmem_cache_name(struct kmem_cache *);
> int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
> +int kmem_cache_defrag(int node);
The definition in slab.c always returns 0. Wouldn't a static inline
function in the header be better?
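For illustration, a sketch of what such a stub could look like (whether the
real implementation sits behind a config option like CONFIG_SLUB is an
assumption here, not something from the patch):

	#ifdef CONFIG_SLUB
	int kmem_cache_defrag(int node);
	#else
	/* No defrag support: report that nothing was reclaimed. */
	static inline int kmem_cache_defrag(int node)
	{
		return 0;
	}
	#endif
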
> * Returns the number of slab objects which we shrunk.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> */
> unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> - unsigned long lru_pages)
> + unsigned long lru_pages, struct zone *zone)
> {
> struct shrinker *shrinker;
> unsigned long ret = 0;
> @@ -210,6 +218,8 @@ unsigned long shrink_slab(unsigned long
> shrinker->nr += total_scan;
> }
> up_read(&shrinker_rwsem);
> + if (gfp_mask & __GFP_FS)
> + kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
> return ret;
> }
What about the objects that kmem_cache_defrag() releases? Shouldn't
they be counted too?
ret += kmem_cache_defrag(...)
Or am I overlooking something here?
Hannes
On Wed, 7 Nov 2007, Johannes Weiner wrote:
> > @@ -210,6 +218,8 @@ unsigned long shrink_slab(unsigned long
> > shrinker->nr += total_scan;
> > }
> > up_read(&shrinker_rwsem);
> > + if (gfp_mask & __GFP_FS)
> > + kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
> > return ret;
> > }
>
> What about the objects that kmem_cache_defrag() releases? Shouldn't
> they be counted too?
>
> ret += kmem_cache_defrag(...)
>
> Or am I overlooking something here?
kmem_cache_defrag returns the number of pages that were released by defrag
actions.
shrink_slab returns the number of objects released by the shrinkers.
kmem_cache_defrag has no way of knowing how many objects were released by
the kick methods. The kick method may also have chosen to reallocate the
object rather than free it.
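Restating the mismatch as prototypes (the annotations are added here for
clarity, they are not part of the patch):

	unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
				unsigned long lru_pages, struct zone *zone);
				/* returns the number of objects shrunk */

	int kmem_cache_defrag(int node);
				/* returns the number of pages freed by defrag */

So simply adding the defrag result to ret would mix object counts with
page counts.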
On (06/11/07 17:11), Christoph Lameter didst pronounce:
> @@ -210,6 +218,8 @@ unsigned long shrink_slab(unsigned long
> shrinker->nr += total_scan;
> }
> up_read(&shrinker_rwsem);
> + if (gfp_mask & __GFP_FS)
> + kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
Does this make an assumption that only filesystem-related slabs may be
targeted for reclaim? What if there is a slab that can free its objects
without ever caring about a filesystem?
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Thu, 8 Nov 2007, Mel Gorman wrote:
> > up_read(&shrinker_rwsem);
> > + if (gfp_mask & __GFP_FS)
> > + kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
>
> Does this make an assumption that only filesystem-related slabs may be
> targeted for reclaim? What if there is a slab that can free its objects
> without ever caring about a filesystem?
Correct. Currently only filesystem-related slabs support slab defrag.
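For reference, a non-filesystem cache would first have to provide the
get()/kick() callbacks from the earlier patches in this series before
kmem_cache_defrag() could do anything with it. A hypothetical sketch (the
kmem_cache_setup_defrag() name and signature are assumed from the rest of
the series, and the my_* names are made up for illustration):

	static struct kmem_cache *my_cache;

	/* Hypothetical refcount helpers for the objects in my_cache. */
	extern void my_object_get(void *object);
	extern void my_object_put(void *object);

	/* Pin the objects so they cannot be freed while we work on them. */
	static void *my_get(struct kmem_cache *s, int nr, void **v)
	{
		int i;

		for (i = 0; i < nr; i++)
			my_object_get(v[i]);
		return NULL;	/* no private state to hand to kick() */
	}

	/* Drop the references; objects that become unused are freed and
	 * the slab page can then be returned to the page allocator. */
	static void my_kick(struct kmem_cache *s, int nr, void **v,
							void *private)
	{
		int i;

		for (i = 0; i < nr; i++)
			my_object_put(v[i]);
	}

	/* At cache creation time: */
	kmem_cache_setup_defrag(my_cache, my_get, my_kick);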