2013-04-09 01:20:46

by Joonsoo Kim

Subject: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru

In shrink_(in)active_list(), we can fail to put pages back onto the lru, and
these pages end up being reclaimed as a side effect. Currently, such pages
are not counted in sc->nr_reclaimed, but with this information we can stop
reclaiming earlier and so reduce the overhead of reclaim.
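
(The failure to "put into lru" happens via put_page_testzero() when isolated
pages are put back; a rough sketch of the surrounding 3.9-era code, not
introduced by this patch:)

static void move_active_pages_to_lru(struct lruvec *lruvec,
                                     struct list_head *list,
                                     struct list_head *pages_to_free,
                                     enum lru_list lru)
{
        struct zone *zone = lruvec_zone(lruvec);
        struct page *page;

        while (!list_empty(list)) {
                page = lru_to_page(list);

                SetPageLRU(page);
                list_move(&page->lru, &lruvec->lists[lru]);

                if (put_page_testzero(page)) {
                        /*
                         * The last reference was dropped while the page was
                         * isolated: it cannot stay on the lru and is handed
                         * to free_hot_cold_page_list() instead, without
                         * being counted as reclaimed.
                         */
                        __ClearPageLRU(page);
                        __ClearPageActive(page);
                        del_page_from_lru_list(page, lruvec, lru);
                        list_add(&page->lru, pages_to_free);
                }
        }
        /* ... */
}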

Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0f615eb..5d60ae0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);

extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..a5f3952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1360,14 +1360,18 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
{
+ unsigned long nr_reclaimed = 0;
struct page *page, *next;

list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
+ nr_reclaimed++;
}
+
+ return nr_reclaimed;
}

/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed..eff2927 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
__clear_page_locked(page);
free_it:
- nr_reclaimed++;

/*
* Is there need to periodically free_page_list? It would
@@ -954,7 +953,7 @@ keep:
if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
zone_set_flag(zone, ZONE_CONGESTED);

- free_hot_cold_page_list(&free_pages, 1);
+ nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);

list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;

- nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);

spin_lock_irq(&zone->lru_lock);
@@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,

spin_unlock_irq(&zone->lru_lock);

- free_hot_cold_page_list(&page_list, 1);
+ nr_reclaimed += free_hot_cold_page_list(&page_list, 1);

/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
__count_vm_events(PGDEACTIVATE, pgmoved);
}

-static void shrink_active_list(unsigned long nr_to_scan,
+static unsigned long shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
@@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);

- free_hot_cold_page_list(&l_hold, 1);
+ return free_hot_cold_page_list(&l_hold, 1);
}

#ifdef CONFIG_SWAP
@@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
{
if (is_active_lru(lru)) {
if (inactive_list_is_low(lruvec, lru))
- shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ return shrink_active_list(nr_to_scan, lruvec, sc, lru);
+
return 0;
}

@@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);

throttle_vm_writeout(sc->gfp_mask);
}
@@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif

-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static unsigned long age_active_anon(struct zone *zone,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
struct mem_cgroup *memcg;

if (!total_swap_pages)
- return;
+ return 0;

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);

memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
+
+ return nr_reclaimed;
}

static bool zone_balanced(struct zone *zone, int order,
@@ -2666,7 +2670,7 @@ loop_again:
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- age_active_anon(zone, &sc);
+ sc.nr_reclaimed += age_active_anon(zone, &sc);

/*
* If the number of buffer_heads in the machine
--
1.7.9.5


2013-04-09 01:20:48

by Joonsoo Kim

Subject: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

Currently, pages freed via rcu are not counted in reclaimed_slab, because
they are freed in rcu context, not in the current task's context. But this
free is initiated by the task, so counting these pages in the task's
reclaimed_slab is meaningful for deciding whether to continue reclaim or not.
So change the code to count these pages toward this task's reclaimed_slab.
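
(For context, current->reclaim_state is set up by the reclaim paths that end
up calling into the slab allocators; a rough sketch of the 3.9-era direct
reclaim path in mm/page_alloc.c, not part of this patch:)

static int __perform_reclaim(gfp_t gfp_mask, unsigned int order,
                             struct zonelist *zonelist, nodemask_t *nodemask)
{
        struct reclaim_state reclaim_state;
        int progress;

        /* filled in by the slab allocators while this task reclaims */
        reclaim_state.reclaimed_slab = 0;
        current->reclaim_state = &reclaim_state;

        progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);

        current->reclaim_state = NULL;

        return progress;
}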

Cc: Christoph Lameter <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: Matt Mackall <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/slub.c b/mm/slub.c
index 4aec537..16fd2d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1409,8 +1409,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)

memcg_release_pages(s, order);
page_mapcount_reset(page);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += pages;
__free_memcg_kmem_pages(page, order);
}

@@ -1431,6 +1429,8 @@ static void rcu_free_slab(struct rcu_head *h)

static void free_slab(struct kmem_cache *s, struct page *page)
{
+ int pages = 1 << compound_order(page);
+
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
struct rcu_head *head;

@@ -1450,6 +1450,9 @@ static void free_slab(struct kmem_cache *s, struct page *page)
call_rcu(head, rcu_free_slab);
} else
__free_slab(s, page);
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
}

static void discard_slab(struct kmem_cache *s, struct page *page)
--
1.7.9.5

2013-04-09 01:20:57

by Joonsoo Kim

Subject: [PATCH 3/3] mm, slab: count freed pages via rcu as this task's reclaimed_slab

Currently, pages freed via rcu are not counted in reclaimed_slab, because
they are freed in rcu context, not in the current task's context. But this
free is initiated by the task, so counting these pages in the task's
reclaimed_slab is meaningful for deciding whether to continue reclaim or not.
So change the code to count these pages toward this task's reclaimed_slab.

Cc: Christoph Lameter <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: Matt Mackall <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/mm/slab.c b/mm/slab.c
index 856e4a1..4d94bcb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1934,8 +1934,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
}

memcg_release_pages(cachep, cachep->gfporder);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += nr_freed;
free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}

@@ -2165,6 +2163,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
*/
static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
+ unsigned long nr_freed = (1 << cachep->gfporder);
void *addr = slabp->s_mem - slabp->colouroff;

slab_destroy_debugcheck(cachep, slabp);
@@ -2180,6 +2179,9 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
if (OFF_SLAB(cachep))
kmem_cache_free(cachep->slabp_cache, slabp);
}
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += nr_freed;
}

/**
--
1.7.9.5

2013-04-09 05:55:19

by Minchan Kim

Subject: Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru

Hello Joonsoo,

On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> In shrink_(in)active_list(), we can fail to put into lru, and these pages
> are reclaimed accidentally. Currently, these pages are not counted
> for sc->nr_reclaimed, but with this information, we can stop to reclaim
> earlier, so can reduce overhead of reclaim.
>
> Signed-off-by: Joonsoo Kim <[email protected]>

Nice catch!

But this patch handles a very corner case and makes the reclaim functions'
names rather stupid, so I'd like to see the text size change after we apply
this patch. Other nitpicks below.

>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 0f615eb..5d60ae0 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
> extern void __free_pages(struct page *page, unsigned int order);
> extern void free_pages(unsigned long addr, unsigned int order);
> extern void free_hot_cold_page(struct page *page, int cold);
> -extern void free_hot_cold_page_list(struct list_head *list, int cold);
> +extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);
>
> extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
> extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8fcced7..a5f3952 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1360,14 +1360,18 @@ out:
> /*
> * Free a list of 0-order pages
> */
> -void free_hot_cold_page_list(struct list_head *list, int cold)
> +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> {
> + unsigned long nr_reclaimed = 0;

How about nr_free or nr_freed, to be consistent with the function name?

> struct page *page, *next;
>
> list_for_each_entry_safe(page, next, list, lru) {
> trace_mm_page_free_batched(page, cold);
> free_hot_cold_page(page, cold);
> + nr_reclaimed++;
> }
> +
> + return nr_reclaimed;
> }
>
> /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 88c5fed..eff2927 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> */
> __clear_page_locked(page);
> free_it:
> - nr_reclaimed++;
>
> /*
> * Is there need to periodically free_page_list? It would
> @@ -954,7 +953,7 @@ keep:
> if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
> zone_set_flag(zone, ZONE_CONGESTED);
>
> - free_hot_cold_page_list(&free_pages, 1);
> + nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);

Nice cleanup.

>
> list_splice(&ret_pages, page_list);
> count_vm_events(PGACTIVATE, pgactivate);
> @@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> if (nr_taken == 0)
> return 0;
>
> - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> &nr_dirty, &nr_writeback, false);

Do you have any reason for this change?
To me, '=' is clearer for initializing the variable.
When I see the above, I have to look back through the earlier lines to find
where the code has already used nr_reclaimed.

>
> spin_lock_irq(&zone->lru_lock);
> @@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>
> spin_unlock_irq(&zone->lru_lock);
>
> - free_hot_cold_page_list(&page_list, 1);
> + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);

How about considering vmstat, too?
It may be minor, but you are counting these freed pages as reclaimed in this
context (ie, sc->nr_reclaimed), so it would be more appropriate.

>
> /*
> * If reclaim is isolating dirty pages under writeback, it implies
> @@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
> __count_vm_events(PGDEACTIVATE, pgmoved);
> }
>
> -static void shrink_active_list(unsigned long nr_to_scan,
> +static unsigned long shrink_active_list(unsigned long nr_to_scan,
> struct lruvec *lruvec,
> struct scan_control *sc,
> enum lru_list lru)
> @@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
> __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
> spin_unlock_irq(&zone->lru_lock);
>
> - free_hot_cold_page_list(&l_hold, 1);
> + return free_hot_cold_page_list(&l_hold, 1);

It would be better to add a comment about the return value.
Otherwise, people could confuse it with the number of pages moved from
active to inactive.

> }
>
> #ifdef CONFIG_SWAP
> @@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> {
> if (is_active_lru(lru)) {
> if (inactive_list_is_low(lruvec, lru))
> - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> +

Unnecessary change.

> return 0;
> }
>
> @@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> * rebalance the anon lru active/inactive ratio.
> */
> if (inactive_anon_is_low(lruvec))
> - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> - sc, LRU_ACTIVE_ANON);
> + sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> + lruvec, sc, LRU_ACTIVE_ANON);
>
> throttle_vm_writeout(sc->gfp_mask);
> }
> @@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> }
> #endif
>
> -static void age_active_anon(struct zone *zone, struct scan_control *sc)

Please add a comment about the return value,
or rename the function, but I have no good name in mind. Sorry.

> +static unsigned long age_active_anon(struct zone *zone,
> + struct scan_control *sc)
> {
> + unsigned long nr_reclaimed = 0;
> struct mem_cgroup *memcg;
>
> if (!total_swap_pages)
> - return;
> + return 0;
>
> memcg = mem_cgroup_iter(NULL, NULL, NULL);
> do {
> struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
>
> if (inactive_anon_is_low(lruvec))
> - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> - sc, LRU_ACTIVE_ANON);
> + nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> + lruvec, sc, LRU_ACTIVE_ANON);
>
> memcg = mem_cgroup_iter(NULL, memcg, NULL);
> } while (memcg);
> +
> + return nr_reclaimed;
> }
>
> static bool zone_balanced(struct zone *zone, int order,
> @@ -2666,7 +2670,7 @@ loop_again:
> * Do some background aging of the anon list, to give
> * pages a chance to be referenced before reclaiming.
> */
> - age_active_anon(zone, &sc);
> + sc.nr_reclaimed += age_active_anon(zone, &sc);
>
> /*
> * If the number of buffer_heads in the machine
> --
> 1.7.9.5
>

--
Kind regards,
Minchan Kim

2013-04-09 09:38:38

by Simon Jeons

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

Hi Joonsoo,
On 04/09/2013 09:21 AM, Joonsoo Kim wrote:
> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.
>
> Cc: Christoph Lameter <[email protected]>
> Cc: Pekka Enberg <[email protected]>
> Cc: Matt Mackall <[email protected]>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/slub.c b/mm/slub.c
> index 4aec537..16fd2d5 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1409,8 +1409,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
>
> memcg_release_pages(s, order);
> page_mapcount_reset(page);
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += pages;
> __free_memcg_kmem_pages(page, order);
> }
>
> @@ -1431,6 +1429,8 @@ static void rcu_free_slab(struct rcu_head *h)
>
> static void free_slab(struct kmem_cache *s, struct page *page)
> {
> + int pages = 1 << compound_order(page);

One question unrelated to this patch: why can a slab cache use compound
pages (hugetlbfs pages/thp pages)? Aren't they just used by applications to
reduce tlb misses?

> +
> if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
> struct rcu_head *head;
>
> @@ -1450,6 +1450,9 @@ static void free_slab(struct kmem_cache *s, struct page *page)
> call_rcu(head, rcu_free_slab);
> } else
> __free_slab(s, page);
> +
> + if (current->reclaim_state)
> + current->reclaim_state->reclaimed_slab += pages;
> }
>
> static void discard_slab(struct kmem_cache *s, struct page *page)

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

On Tue, 9 Apr 2013, Joonsoo Kim wrote:

> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.

slab->reclaim_state guides the reclaim actions in vmscan.c. With this
patch slab->reclaim_state could get quite a high value without new pages being
available for allocation. slab->reclaim_state will only be updated
when the RCU period ends.
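
(Roughly, the consumer of reclaim_state in the 3.9-era do_try_to_free_pages()
looks like this; a sketch:)

        do {
                /* ... */
                shrink_zones(zonelist, sc);

                if (global_reclaim(sc)) {
                        shrink_slab(shrink, sc->nr_scanned, lru_pages);
                        if (reclaim_state) {
                                /* slab pages freed on behalf of this task */
                                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
                                reclaim_state->reclaimed_slab = 0;
                        }
                }

                /* the "do we continue reclaim" decision */
                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                        goto out;
                /* ... */
        } while (--sc->priority >= 0);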

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

On Tue, 9 Apr 2013, Simon Jeons wrote:

> > + int pages = 1 << compound_order(page);
>
> One question irrelevant this patch. Why slab cache can use compound
> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
> miss, is it?

Slab caches can use any order pages because these pages are never on
the LRU and are not part of the page cache. Large contiguous physical
memory means that objects can be arranged in a more efficient way in the
page. This is particularly useful for larger objects where we might use a
lot of memory because objects do not fit well into a 4k page.

It also reduces the slab page management if higher order pages are used.
In the case of slub the page size also determines the number of objects
that can be allocated/freed without the need for some form of
synchronization.

2013-04-10 03:20:19

by Simon Jeons

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

Hi Christoph,
On 04/09/2013 10:32 PM, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Simon Jeons wrote:
>
>>> + int pages = 1 << compound_order(page);
>> One question irrelevant this patch. Why slab cache can use compound
>> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
>> miss, is it?
> Slab caches can use any order pages because these pages are never on
> the LRU and are not part of the page cache. Large continuous physical
> memory means that objects can be arranged in a more efficient way in the
> page. This is particularly useful for larger objects where we might use a
> lot of memory because objects do not fit well into a 4k page.
>
> It also reduces the slab page management if higher order pages are used.
> In the case of slub the page size also determines the number of objects
> that can be allocated/freed without the need for some form of
> synchronization.

It seems that you misunderstood my question. I don't doubt that slab/slub
can use high order pages. However, what I'm focusing on is why slab/slub can
use compound pages; isn't PageCompound() meant only for hugetlbfs pages or
thp pages, which are used by apps?

>

2013-04-10 05:25:37

by Joonsoo Kim

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

Hello, Christoph.

On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Joonsoo Kim wrote:
>
> > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > it is freed in rcu context, not current task context. But, this free is
> > initiated by this task, so counting this into this task's reclaimed_slab
> > is meaningful to decide whether we continue reclaim, or not.
> > So change code to count these pages for this task's reclaimed_slab.
>
> slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> patch slab->reclaim_state could get quite a high value without new pages being
> available for allocation. slab->reclaim_state will only be updated
> when the RCU period ends.

Okay.

In addition, there are only a few places that use SLAB_DESTROY_BY_RCU.
I will drop this patch [2/3] and [3/3] in the next spin.

Thanks.


2013-04-10 05:47:43

by Joonsoo Kim

Subject: Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru

Hello, Minchan.

On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> Hello Joonsoo,
>
> On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> > In shrink_(in)active_list(), we can fail to put into lru, and these pages
> > are reclaimed accidentally. Currently, these pages are not counted
> > for sc->nr_reclaimed, but with this information, we can stop to reclaim
> > earlier, so can reduce overhead of reclaim.
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
>
> Nice catch!
>
> But this patch handles very corner case and makes reclaim function's name
> rather stupid so I'd like to see text size change after we apply this patch.
> Other nipicks below.

Ah... Yes.
I can rework it to add the number to sc->nr_reclaimed directly in both
cases, shrink_active_list() and age_active_anon().

>
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 0f615eb..5d60ae0 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
> > extern void __free_pages(struct page *page, unsigned int order);
> > extern void free_pages(unsigned long addr, unsigned int order);
> > extern void free_hot_cold_page(struct page *page, int cold);
> > -extern void free_hot_cold_page_list(struct list_head *list, int cold);
> > +extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);
> >
> > extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
> > extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 8fcced7..a5f3952 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1360,14 +1360,18 @@ out:
> > /*
> > * Free a list of 0-order pages
> > */
> > -void free_hot_cold_page_list(struct list_head *list, int cold)
> > +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> > {
> > + unsigned long nr_reclaimed = 0;
>
> How about nr_free or nr_freed for consistent with function title?

Okay.

>
> > struct page *page, *next;
> >
> > list_for_each_entry_safe(page, next, list, lru) {
> > trace_mm_page_free_batched(page, cold);
> > free_hot_cold_page(page, cold);
> > + nr_reclaimed++;
> > }
> > +
> > + return nr_reclaimed;
> > }
> >
> > /*
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 88c5fed..eff2927 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> > */
> > __clear_page_locked(page);
> > free_it:
> > - nr_reclaimed++;
> >
> > /*
> > * Is there need to periodically free_page_list? It would
> > @@ -954,7 +953,7 @@ keep:
> > if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
> > zone_set_flag(zone, ZONE_CONGESTED);
> >
> > - free_hot_cold_page_list(&free_pages, 1);
> > + nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);
>
> Nice cleanup.
>
> >
> > list_splice(&ret_pages, page_list);
> > count_vm_events(PGACTIVATE, pgactivate);
> > @@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> > if (nr_taken == 0)
> > return 0;
> >
> > - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > &nr_dirty, &nr_writeback, false);
>
> Do you have any reason to change?
> To me, '=' is more clear to initialize the variable.
> When I see above, I have to look through above lines to catch where code
> used the nr_reclaimed.
>

There is no particular reason; I will change it.

> >
> > spin_lock_irq(&zone->lru_lock);
> > @@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> >
> > spin_unlock_irq(&zone->lru_lock);
> >
> > - free_hot_cold_page_list(&page_list, 1);
> > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
>
> How about considering vmstat, too?
> It could be minor but you are considering freed page as
> reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.

I don't understand what you mean.
Please explain a bit more about what you have in mind :)

>
> >
> > /*
> > * If reclaim is isolating dirty pages under writeback, it implies
> > @@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
> > __count_vm_events(PGDEACTIVATE, pgmoved);
> > }
> >
> > -static void shrink_active_list(unsigned long nr_to_scan,
> > +static unsigned long shrink_active_list(unsigned long nr_to_scan,
> > struct lruvec *lruvec,
> > struct scan_control *sc,
> > enum lru_list lru)
> > @@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
> > __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
> > spin_unlock_irq(&zone->lru_lock);
> >
> > - free_hot_cold_page_list(&l_hold, 1);
> > + return free_hot_cold_page_list(&l_hold, 1);
>
> It would be better to add comment about return value.
> Otherwise, people could confuse with the number of pages moved from
> active to inactive.
>

In the next spin, I will not change the return type,
so the above problem will disappear.

> > }
> >
> > #ifdef CONFIG_SWAP
> > @@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> > {
> > if (is_active_lru(lru)) {
> > if (inactive_list_is_low(lruvec, lru))
> > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > +
>
> Unnecessary change.
>

Why?

> > return 0;
> > }
> >
> > @@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> > * rebalance the anon lru active/inactive ratio.
> > */
> > if (inactive_anon_is_low(lruvec))
> > - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> > - sc, LRU_ACTIVE_ANON);
> > + sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> > + lruvec, sc, LRU_ACTIVE_ANON);
> >
> > throttle_vm_writeout(sc->gfp_mask);
> > }
> > @@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> > }
> > #endif
> >
> > -static void age_active_anon(struct zone *zone, struct scan_control *sc)
>
> Comment about return value.
> or rename but I have no idea. Sorry.

This will disappear in the next spin.

Thanks for the detailed review.

>
> > +static unsigned long age_active_anon(struct zone *zone,
> > + struct scan_control *sc)
> > {
> > + unsigned long nr_reclaimed = 0;
> > struct mem_cgroup *memcg;
> >
> > if (!total_swap_pages)
> > - return;
> > + return 0;
> >
> > memcg = mem_cgroup_iter(NULL, NULL, NULL);
> > do {
> > struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> >
> > if (inactive_anon_is_low(lruvec))
> > - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> > - sc, LRU_ACTIVE_ANON);
> > + nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> > + lruvec, sc, LRU_ACTIVE_ANON);
> >
> > memcg = mem_cgroup_iter(NULL, memcg, NULL);
> > } while (memcg);
> > +
> > + return nr_reclaimed;
> > }
> >
> > static bool zone_balanced(struct zone *zone, int order,
> > @@ -2666,7 +2670,7 @@ loop_again:
> > * Do some background aging of the anon list, to give
> > * pages a chance to be referenced before reclaiming.
> > */
> > - age_active_anon(zone, &sc);
> > + sc.nr_reclaimed += age_active_anon(zone, &sc);
> >
> > /*
> > * If the number of buffer_heads in the machine
> > --
> > 1.7.9.5
>
> --
> Kind regards,
> Minchan Kim

2013-04-10 06:03:56

by Minchan Kim

Subject: Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru

On Wed, Apr 10, 2013 at 02:48:22PM +0900, Joonsoo Kim wrote:
> Hello, Minchan.
>
> On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> > Hello Joonsoo,
> >
> > On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> > > In shrink_(in)active_list(), we can fail to put into lru, and these pages
> > > are reclaimed accidentally. Currently, these pages are not counted
> > > for sc->nr_reclaimed, but with this information, we can stop to reclaim
> > > earlier, so can reduce overhead of reclaim.
> > >
> > > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > Nice catch!
> >
> > But this patch handles very corner case and makes reclaim function's name
> > rather stupid so I'd like to see text size change after we apply this patch.
> > Other nipicks below.
>
> Ah... Yes.
> I can re-work it to add number to sc->nr_reclaimed directly for both cases,
> shrink_active_list() and age_active_anon().

Sounds better.

>
> >
> > >
> > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > > index 0f615eb..5d60ae0 100644
> > > --- a/include/linux/gfp.h
> > > +++ b/include/linux/gfp.h
> > > @@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
> > > extern void __free_pages(struct page *page, unsigned int order);
> > > extern void free_pages(unsigned long addr, unsigned int order);
> > > extern void free_hot_cold_page(struct page *page, int cold);
> > > -extern void free_hot_cold_page_list(struct list_head *list, int cold);
> > > +extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);
> > >
> > > extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
> > > extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 8fcced7..a5f3952 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1360,14 +1360,18 @@ out:
> > > /*
> > > * Free a list of 0-order pages
> > > */
> > > -void free_hot_cold_page_list(struct list_head *list, int cold)
> > > +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> > > {
> > > + unsigned long nr_reclaimed = 0;
> >
> > How about nr_free or nr_freed for consistent with function title?
>
> Okay.
>
> >
> > > struct page *page, *next;
> > >
> > > list_for_each_entry_safe(page, next, list, lru) {
> > > trace_mm_page_free_batched(page, cold);
> > > free_hot_cold_page(page, cold);
> > > + nr_reclaimed++;
> > > }
> > > +
> > > + return nr_reclaimed;
> > > }
> > >
> > > /*
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 88c5fed..eff2927 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> > > */
> > > __clear_page_locked(page);
> > > free_it:
> > > - nr_reclaimed++;
> > >
> > > /*
> > > * Is there need to periodically free_page_list? It would
> > > @@ -954,7 +953,7 @@ keep:
> > > if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
> > > zone_set_flag(zone, ZONE_CONGESTED);
> > >
> > > - free_hot_cold_page_list(&free_pages, 1);
> > > + nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);
> >
> > Nice cleanup.
> >
> > >
> > > list_splice(&ret_pages, page_list);
> > > count_vm_events(PGACTIVATE, pgactivate);
> > > @@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> > > if (nr_taken == 0)
> > > return 0;
> > >
> > > - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > > + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > > &nr_dirty, &nr_writeback, false);
> >
> > Do you have any reason to change?
> > To me, '=' is more clear to initialize the variable.
> > When I see above, I have to look through above lines to catch where code
> > used the nr_reclaimed.
> >
>
> There is no reason, I will change it.
>
> > >
> > > spin_lock_irq(&zone->lru_lock);
> > > @@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> > >
> > > spin_unlock_irq(&zone->lru_lock);
> > >
> > > - free_hot_cold_page_list(&page_list, 1);
> > > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
> >
> > How about considering vmstat, too?
> > It could be minor but you are considering freed page as
> > reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.
>
> I don't understand what you mean.
> Please explain more what you have in mind :)

We are accounting PGSTEAL_[KSWAPD|DIRECT] with nr_reclaimed, and
nr_reclaimed contributes to the sc->nr_reclaimed accumulation, but your
patch doesn't take that into account, so vmstat could be inexact.
Of course, it's not exact inherently, so it's no big deal; that's why I said
"minor". I just want you to think that over.
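
(For reference, the accounting in question, roughly as it looks in the
3.9-era shrink_inactive_list(); PGSTEAL is bumped before the late
free_hot_cold_page_list() contribution that the patch adds:)

        spin_lock_irq(&zone->lru_lock);

        reclaim_stat->recent_scanned[file] += nr_taken;

        if (global_reclaim(sc)) {
                /* nr_reclaimed here does not yet include the pages that
                 * free_hot_cold_page_list() will free below */
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
                                               nr_reclaimed);
                else
                        __count_zone_vm_events(PGSTEAL_DIRECT, zone,
                                               nr_reclaimed);
        }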

Thanks!

>
> >
> > >
> > > /*
> > > * If reclaim is isolating dirty pages under writeback, it implies
> > > @@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
> > > __count_vm_events(PGDEACTIVATE, pgmoved);
> > > }
> > >
> > > -static void shrink_active_list(unsigned long nr_to_scan,
> > > +static unsigned long shrink_active_list(unsigned long nr_to_scan,
> > > struct lruvec *lruvec,
> > > struct scan_control *sc,
> > > enum lru_list lru)
> > > @@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
> > > __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
> > > spin_unlock_irq(&zone->lru_lock);
> > >
> > > - free_hot_cold_page_list(&l_hold, 1);
> > > + return free_hot_cold_page_list(&l_hold, 1);
> >
> > It would be better to add comment about return value.
> > Otherwise, people could confuse with the number of pages moved from
> > active to inactive.
> >
>
> In next spin, I will not change return type.
> So above problem will be disappreared.
>
> > > }
> > >
> > > #ifdef CONFIG_SWAP
> > > @@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> > > {
> > > if (is_active_lru(lru)) {
> > > if (inactive_list_is_low(lruvec, lru))
> > > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > +
> >
> > Unnecessary change.
> >
>
> Why?

You are adding an unnecessary newline.

>
> > > return 0;
> > > }
> > >
> > > @@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> > > * rebalance the anon lru active/inactive ratio.
> > > */
> > > if (inactive_anon_is_low(lruvec))
> > > - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> > > - sc, LRU_ACTIVE_ANON);
> > > + sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> > > + lruvec, sc, LRU_ACTIVE_ANON);
> > >
> > > throttle_vm_writeout(sc->gfp_mask);
> > > }
> > > @@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> > > }
> > > #endif
> > >
> > > -static void age_active_anon(struct zone *zone, struct scan_control *sc)
> >
> > Comment about return value.
> > or rename but I have no idea. Sorry.
>
> This will be disappreared in next spin.
>
> Thanks for detailed review.
>
> >
> > > +static unsigned long age_active_anon(struct zone *zone,
> > > + struct scan_control *sc)
> > > {
> > > + unsigned long nr_reclaimed = 0;
> > > struct mem_cgroup *memcg;
> > >
> > > if (!total_swap_pages)
> > > - return;
> > > + return 0;
> > >
> > > memcg = mem_cgroup_iter(NULL, NULL, NULL);
> > > do {
> > > struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> > >
> > > if (inactive_anon_is_low(lruvec))
> > > - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> > > - sc, LRU_ACTIVE_ANON);
> > > + nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
> > > + lruvec, sc, LRU_ACTIVE_ANON);
> > >
> > > memcg = mem_cgroup_iter(NULL, memcg, NULL);
> > > } while (memcg);
> > > +
> > > + return nr_reclaimed;
> > > }
> > >
> > > static bool zone_balanced(struct zone *zone, int order,
> > > @@ -2666,7 +2670,7 @@ loop_again:
> > > * Do some background aging of the anon list, to give
> > > * pages a chance to be referenced before reclaiming.
> > > */
> > > - age_active_anon(zone, &sc);
> > > + sc.nr_reclaimed += age_active_anon(zone, &sc);
> > >
> > > /*
> > > * If the number of buffer_heads in the machine
> > > --
> > > 1.7.9.5
> >
> > --
> > Kind regards,
> > Minchan Kim

--
Kind regards,
Minchan Kim

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

On Wed, 10 Apr 2013, Simon Jeons wrote:

> It seems that you misunderstand my question. I don't doubt slab/slub can use
> high order pages. However, what I focus on is why slab/slub can use compound
> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
> should used by apps, isn't it?

I am not entirely clear on what you are asking for. The following gives a
couple of answers to what I guess the question was.

THP pages and user pages are on the lru and are managed differently.
The slab allocators cannot work with those pages.

Slab allocators *can* allocate higher order pages therefore they could
allocate a page of the same order as huge pages and manage it that way.

However there is no way that these pages could be handled like THP pages
since they cannot be broken up (unless we add the capability to move slab
objects which I wanted to do for a long time).


You can boot a Linux system that uses huge pages for slab allocation
by specifying the following parameter on the kernel command line.

slub_min_order=9

The slub allocator will start using huge pages for all its storage
needs. You need a large number of huge pages to do this. Lots of memory
is going to be lost due to fragmentation but it's going to be fast since
the slowpaths are rarely used. OOMs due to reclaim failure become much
more likely ;-).

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

On Wed, 10 Apr 2013, Joonsoo Kim wrote:

> Hello, Christoph.
>
> On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> > On Tue, 9 Apr 2013, Joonsoo Kim wrote:
> >
> > > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > > it is freed in rcu context, not current task context. But, this free is
> > > initiated by this task, so counting this into this task's reclaimed_slab
> > > is meaningful to decide whether we continue reclaim, or not.
> > > So change code to count these pages for this task's reclaimed_slab.
> >
> > slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> > patch slab->reclaim_state could get quite a high value without new pages being
> > available for allocation. slab->reclaim_state will only be updated
> > when the RCU period ends.
>
> Okay.
>
> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
> I will drop this patch[2/3] and [3/3] for next spin.

What you have discovered is an issue that we have so far overlooked. Could
you add comments to both places explaining the situation? RCU is used for
some inode caches and the dentry cache. Failing to account for these frees
could pose a problem. One solution would be to ensure that we get through an
RCU quiescent period in the slab reclaim path. If we can ensure that, then
your patch may be ok.
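
(A purely hypothetical sketch of that idea; drain_rcu_freed_slabs() is an
invented name, and rcu_barrier() is just one possible primitive:)

/*
 * Hypothetical helper: wait until all pending rcu_free_slab() callbacks
 * from SLAB_DESTROY_BY_RCU caches have run, so the pages they free (and
 * the reclaimed_slab accounting) really are available to the reclaimer.
 */
static void drain_rcu_freed_slabs(void)
{
        rcu_barrier();
}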

2013-04-10 14:24:41

by Joonsoo Kim

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

2013/4/10 Christoph Lameter <[email protected]>:
> On Wed, 10 Apr 2013, Joonsoo Kim wrote:
>
>> Hello, Christoph.
>>
>> On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
>> > On Tue, 9 Apr 2013, Joonsoo Kim wrote:
>> >
>> > > Currently, freed pages via rcu is not counted for reclaimed_slab, because
>> > > it is freed in rcu context, not current task context. But, this free is
>> > > initiated by this task, so counting this into this task's reclaimed_slab
>> > > is meaningful to decide whether we continue reclaim, or not.
>> > > So change code to count these pages for this task's reclaimed_slab.
>> >
>> > slab->reclaim_state guides the reclaim actions in vmscan.c. With this
>> > patch slab->reclaim_state could get quite a high value without new pages being
>> > available for allocation. slab->reclaim_state will only be updated
>> > when the RCU period ends.
>>
>> Okay.
>>
>> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
>> I will drop this patch[2/3] and [3/3] for next spin.
>
> What you have discoverd is an issue that we have so far overlooked. Could
> you add comments to both places explaining the situation?

Yes, I can.

> RCU is used for
> some inode and the dentry cache. Failing to account for these frees could
> pose a problem. One solution would be to ensure that we get through an RCU
> quiescent period in the slabs reclaim. If we can ensure that then your
> patch may be ok.

Hmm... I don't fully understand the RCU code and its quiescent periods.
But, yes, at first thought it could be one possible solution. Currently,
I don't have the ability to do that, so I will skip thinking about it
for now.

Thanks.


2013-04-11 03:46:34

by Simon Jeons

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

Hi Christoph,
On 04/10/2013 09:54 PM, Christoph Lameter wrote:
> On Wed, 10 Apr 2013, Simon Jeons wrote:
>
>> It seems that you misunderstand my question. I don't doubt slab/slub can use
>> high order pages. However, what I focus on is why slab/slub can use compound
>> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
>> should used by apps, isn't it?
> I am not entirely clear on what you are asking for. The following gives a
> couple of answers to what I guess the question was.
>
> THP pages and user pages are on the lru and are managed differently.
> The slab allocators cannot work with those pages.
>
> Slab allocators *can* allocate higher order pages therefore they could
> allocate a page of the same order as huge pages and manage it that way.
>
> However there is no way that these pages could be handled like THP pages
> since they cannot be broken up (unless we add the capability to move slab
> objects which I wanted to do for a long time).
>
>
> You can boot a Linux system that uses huge pages for slab allocation
> by specifying the following parameter on the kernel command line.
>
> slub_min_order=9
>
> The slub allocator will start using huge pages for all its storage
> needs. You need a large number of huge pages to do this. Lots of memory
> is going to be lost due to fragmentation but its going to be fast since
> the slowpaths are rarely used. OOMs due to reclaim failure become much
> more likely ;-).
>

It seems that I need to simplify my question.
Are all pages of order >= 1 compound pages?

Subject: Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

On Thu, 11 Apr 2013, Simon Jeons wrote:

> It seems that I need to simplify my question.
> Are all pages of order >= 1 compound pages?

In the slub allocator that is true.

One can request and free a series of contiguous pages that are not
compound pages from the page allocator and a couple of subsystems use
those.
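
(A small illustrative sketch of that distinction; whether a high-order page
is compound depends on __GFP_COMP, which slub requests for its higher-order
slab pages via s->allocflags, if I recall the 3.9 code correctly:)

static void compound_vs_plain_example(void)
{
        /* four contiguous pages, not marked compound */
        struct page *plain = alloc_pages(GFP_KERNEL, 2);
        /* same order, but set up as a compound page (head + tails) */
        struct page *comp = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

        if (plain)
                __free_pages(plain, 2);
        if (comp)
                __free_pages(comp, 2);
}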