2014-06-17 04:57:59

by Chen Yucong

[permalink] [raw]
Subject: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

Via https://lkml.org/lkml/2013/4/10/897, we can know that the relative design
idea is to keep

scan_target[anon] : scan_target[file]
== really_scanned_num[anon] : really_scanned_num[file]

But we can find the following snippet in shrink_lruvec():

if (nr_file > nr_anon) {
...
} else {
...
}

However, the above code fragment broke the design idea. We can assume:

nr[LRU_ACTIVE_FILE] = 30
nr[LRU_INACTIVE_FILE] = 30
nr[LRU_ACTIVE_ANON] = 0
nr[LRU_INACTIVE_ANON] = 40

When the value of (nr_reclaimed < nr_to_reclaim) become false, there are
the following results:

nr[LRU_ACTIVE_FILE] = 15
nr[LRU_INACTIVE_FILE] = 15
nr[LRU_ACTIVE_ANON] = 0
nr[LRU_INACTIVE_ANON] = 25
nr_file = 30
nr_anon = 25
file_percent = 30 / 60 = 0.5
anon_percent = 25 / 40 = 0.65

According to the above design idea, we should scan some pages from ANON,
but in fact we execute an erroneous code path due to "if (nr_file > nr_anon)".
In this way, nr[lru] is likely to be a negative number. Luckily,
"nr[lru] -= min(nr[lru], nr_scanned)" can help us to filter out this situation,
but it still deviates from our design idea.

Signed-off-by: Chen Yucong <[email protected]>
---
mm/vmscan.c | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8ffe4e..2c35e34 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2057,8 +2057,7 @@ out:
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
- unsigned long targets[NR_LRU_LISTS];
- unsigned long nr_to_scan;
+ unsigned long file_target, anon_target;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
@@ -2067,8 +2066,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)

get_scan_count(lruvec, sc, nr);

- /* Record the original scan target for proportional adjustments later */
- memcpy(targets, nr, sizeof(nr));
+ file_target = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+ anon_target = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

/*
* Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
@@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
- unsigned long nr_anon, nr_file, percentage;
- unsigned long nr_scanned;
+ unsigned long nr_anon, nr_file, file_percent, anon_percent;
+ unsigned long nr_to_scan, nr_scanned, percentage;

for_each_evictable_lru(lru) {
if (nr[lru]) {
@@ -2122,16 +2121,19 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (!nr_file || !nr_anon)
break;

- if (nr_file > nr_anon) {
- unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
- targets[LRU_ACTIVE_ANON] + 1;
+ file_percent = nr_file * 100 / file_target;
+ anon_percent = nr_anon * 100 / anon_target;
+
+ if (file_percent > anon_percent) {
lru = LRU_BASE;
- percentage = nr_anon * 100 / scan_target;
+ nr_scanned = file_target - nr_file;
+ nr_to_scan = file_target * (100 - anon_percent) / 100;
+ percentage = nr[LRU_FILE] * 100 / nr_file;
} else {
- unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
- targets[LRU_ACTIVE_FILE] + 1;
lru = LRU_FILE;
- percentage = nr_file * 100 / scan_target;
+ nr_scanned = anon_target - nr_anon;
+ nr_to_scan = anon_target * (100 - file_percent) / 100;
+ percentage = nr[LRU_BASE] * 100 / nr_anon;
}

/* Stop scanning the smaller of the LRU */
@@ -2143,14 +2145,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* scan target and the percentage scanning already complete
*/
lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
- nr_scanned = targets[lru] - nr[lru];
- nr[lru] = targets[lru] * (100 - percentage) / 100;
- nr[lru] -= min(nr[lru], nr_scanned);
-
- lru += LRU_ACTIVE;
- nr_scanned = targets[lru] - nr[lru];
- nr[lru] = targets[lru] * (100 - percentage) / 100;
- nr[lru] -= min(nr[lru], nr_scanned);
+ nr_to_scan -= min(nr_to_scan, nr_scanned);
+ nr[lru] = nr_to_scan * percentage / 100;
+ nr[lru + LRU_ACTIVE] = nr_to_scan - nr[lru];

scan_adjusted = true;
}
--
1.7.10.4


2014-06-18 09:01:15

by Jerome Marchand

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On 06/17/2014 06:55 AM, Chen Yucong wrote:
> Via https://lkml.org/lkml/2013/4/10/897, we can know that the relative design
> idea is to keep
>
> scan_target[anon] : scan_target[file]
> == really_scanned_num[anon] : really_scanned_num[file]
>
> But we can find the following snippet in shrink_lruvec():
>
> if (nr_file > nr_anon) {
> ...
> } else {
> ...
> }
>
> However, the above code fragment broke the design idea. We can assume:
>
> nr[LRU_ACTIVE_FILE] = 30
> nr[LRU_INACTIVE_FILE] = 30
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 40
>
> When the value of (nr_reclaimed < nr_to_reclaim) become false, there are
> the following results:
>
> nr[LRU_ACTIVE_FILE] = 15
> nr[LRU_INACTIVE_FILE] = 15
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 25
> nr_file = 30
> nr_anon = 25
> file_percent = 30 / 60 = 0.5
> anon_percent = 25 / 40 = 0.65
>
> According to the above design idea, we should scan some pages from ANON,
> but in fact we execute the an error code path due to "if (nr_file > nr_anon)".
> In this way, nr[lru] is likely to be a negative number. Luckily,
> "nr[lru] -= min(nr[lru], nr_scanned)" can help us to filter this situation,
> but it has rebelled against our design idea.
>
> Signed-off-by: Chen Yucong <[email protected]>
> ---
> mm/vmscan.c | 39 ++++++++++++++++++---------------------
> 1 file changed, 18 insertions(+), 21 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a8ffe4e..2c35e34 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2057,8 +2057,7 @@ out:
> static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> - unsigned long targets[NR_LRU_LISTS];
> - unsigned long nr_to_scan;
> + unsigned long file_target, anon_target;
> enum lru_list lru;
> unsigned long nr_reclaimed = 0;
> unsigned long nr_to_reclaim = sc->nr_to_reclaim;
> @@ -2067,8 +2066,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>
> get_scan_count(lruvec, sc, nr);
>
> - /* Record the original scan target for proportional adjustments later */
> - memcpy(targets, nr, sizeof(nr));
> + file_target = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
> + anon_target = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

Current code adds 1 to these value to avoid divide by zero error.

>
> /*
> * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
> @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> blk_start_plug(&plug);
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> - unsigned long nr_anon, nr_file, percentage;
> - unsigned long nr_scanned;
> + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> + unsigned long nr_to_scan, nr_scanned, percentage;
>
> for_each_evictable_lru(lru) {
> if (nr[lru]) {
> @@ -2122,16 +2121,19 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> if (!nr_file || !nr_anon)
> break;
>
> - if (nr_file > nr_anon) {
> - unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
> - targets[LRU_ACTIVE_ANON] + 1;
> + file_percent = nr_file * 100 / file_target;
> + anon_percent = nr_anon * 100 / anon_target;

Here it could happen.

Jerome

> +
> + if (file_percent > anon_percent) {
> lru = LRU_BASE;
> - percentage = nr_anon * 100 / scan_target;
> + nr_scanned = file_target - nr_file;
> + nr_to_scan = file_target * (100 - anon_percent) / 100;
> + percentage = nr[LRU_FILE] * 100 / nr_file;
> } else {
> - unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
> - targets[LRU_ACTIVE_FILE] + 1;
> lru = LRU_FILE;
> - percentage = nr_file * 100 / scan_target;
> + nr_scanned = anon_target - nr_anon;
> + nr_to_scan = anon_target * (100 - file_percent) / 100;
> + percentage = nr[LRU_BASE] * 100 / nr_anon;
> }
>
> /* Stop scanning the smaller of the LRU */
> @@ -2143,14 +2145,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> * scan target and the percentage scanning already complete
> */
> lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
> - nr_scanned = targets[lru] - nr[lru];
> - nr[lru] = targets[lru] * (100 - percentage) / 100;
> - nr[lru] -= min(nr[lru], nr_scanned);
> -
> - lru += LRU_ACTIVE;
> - nr_scanned = targets[lru] - nr[lru];
> - nr[lru] = targets[lru] * (100 - percentage) / 100;
> - nr[lru] -= min(nr[lru], nr_scanned);
> + nr_to_scan -= min(nr_to_scan, nr_scanned);
> + nr[lru] = nr_to_scan * percentage / 100;
> + nr[lru + LRU_ACTIVE] = nr_to_scan - nr[lru];
>
> scan_adjusted = true;
> }
>

2014-06-18 09:12:06

by Chen Yucong

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Wed, 2014-06-18 at 11:00 +0200, Jerome Marchand wrote:
> > if (!nr_file || !nr_anon)
> > break;
> >
> > - if (nr_file > nr_anon) {
> > - unsigned long scan_target =
> targets[LRU_INACTIVE_ANON] +
> >
> - targets[LRU_ACTIVE_ANON]
> + 1;
> > + file_percent = nr_file * 100 / file_target;
> > + anon_percent = nr_anon * 100 / anon_target;
>
> Here it could happen.
>
>
The snippet
...
if (!nr_file || !nr_anon)
break;
...
can help us to filter the situation which you have described. It comes
from Mel's patch that is called:

mm: vmscan: use proportional scanning during direct reclaim and full
scan at DEF_PRIORITY

thx!
cyc

2014-06-18 09:54:27

by Jerome Marchand

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On 06/18/2014 11:08 AM, Chen Yucong wrote:
> On Wed, 2014-06-18 at 11:00 +0200, Jerome Marchand wrote:
>>> if (!nr_file || !nr_anon)
>>> break;
>>>
>>> - if (nr_file > nr_anon) {
>>> - unsigned long scan_target =
>> targets[LRU_INACTIVE_ANON] +
>>>
>> - targets[LRU_ACTIVE_ANON]
>> + 1;
>>> + file_percent = nr_file * 100 / file_target;
>>> + anon_percent = nr_anon * 100 / anon_target;
>>
>> Here it could happen.
>>
>>
> The snippet
> ...
> if (!nr_file || !nr_anon)
> break;

Looks like nr[] values can only decrease and stay positive. Then the
following should be true at all times:

file_target >= nr_file >= 0
anon_target >= nr_anon >= 0

and the code above should indeed avoid the divide by zero.

Thanks,
Jerome

> ...
> can help us to filter the situation which you have described. It comes
> from Mel's patch that is called:
>
> mm: vmscan: use proportional scanning during direct reclaim and full
> scan at DEF_PRIORITY
>
> thx!
> cyc
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>

2014-06-18 22:27:55

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Tue, 17 Jun 2014 12:55:02 +0800 Chen Yucong <[email protected]> wrote:

> Via https://lkml.org/lkml/2013/4/10/897, we can know that the relative design
> idea is to keep
>
> scan_target[anon] : scan_target[file]
> == really_scanned_num[anon] : really_scanned_num[file]
>
> But we can find the following snippet in shrink_lruvec():
>
> if (nr_file > nr_anon) {
> ...
> } else {
> ...
> }
>
> However, the above code fragment broke the design idea. We can assume:
>
> nr[LRU_ACTIVE_FILE] = 30
> nr[LRU_INACTIVE_FILE] = 30
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 40
>
> When the value of (nr_reclaimed < nr_to_reclaim) become false, there are
> the following results:
>
> nr[LRU_ACTIVE_FILE] = 15
> nr[LRU_INACTIVE_FILE] = 15
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 25
> nr_file = 30
> nr_anon = 25
> file_percent = 30 / 60 = 0.5
> anon_percent = 25 / 40 = 0.65
>
> According to the above design idea, we should scan some pages from ANON,
> but in fact we execute the an error code path due to "if (nr_file > nr_anon)".
> In this way, nr[lru] is likely to be a negative number. Luckily,
> "nr[lru] -= min(nr[lru], nr_scanned)" can help us to filter this situation,
> but it has rebelled against our design idea.

Mel, could you please pencil in some time to look at this one?

Perhaps before doing that you could suggest what sort of testing might
help us understand any runtime effects from this fix.

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a8ffe4e..2c35e34 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> blk_start_plug(&plug);
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> - unsigned long nr_anon, nr_file, percentage;
> - unsigned long nr_scanned;
> + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> + unsigned long nr_to_scan, nr_scanned, percentage;
>
> for_each_evictable_lru(lru) {
> if (nr[lru]) {

The increased stack use is a slight concern - we can be very deep here.
I suspect the "percent" locals are more for convenience/clarity, and
they could be eliminated (in a separate patch) at some cost of clarity?

2014-06-19 00:07:15

by Chen Yucong

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Wed, 2014-06-18 at 15:27 -0700, Andrew Morton wrote:
> On Tue, 17 Jun 2014 12:55:02 +0800 Chen Yucong <[email protected]> wrote:
>
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index a8ffe4e..2c35e34 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> > blk_start_plug(&plug);
> > while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> > nr[LRU_INACTIVE_FILE]) {
> > - unsigned long nr_anon, nr_file, percentage;
> > - unsigned long nr_scanned;
> > + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> > + unsigned long nr_to_scan, nr_scanned, percentage;
> >
> > for_each_evictable_lru(lru) {
> > if (nr[lru]) {
>
> The increased stack use is a slight concern - we can be very deep here.
> I suspect the "percent" locals are more for convenience/clarity, and
> they could be eliminated (in a separate patch) at some cost of clarity?
>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8ffe4e..2c35e34 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2057,8 +2057,7 @@ out:
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control
*sc)
{
unsigned long nr[NR_LRU_LISTS];
- unsigned long targets[NR_LRU_LISTS];
- unsigned long nr_to_scan;
+ unsigned long file_target, anon_target;

From the above snippet, we can know that the "percent" locals come from
targets[NR_LRU_LISTS]. So this fix does not increase the stack.

thx!
cyc

2014-06-19 00:42:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Thu, 19 Jun 2014 08:04:32 +0800 Chen Yucong <[email protected]> wrote:

> On Wed, 2014-06-18 at 15:27 -0700, Andrew Morton wrote:
> > On Tue, 17 Jun 2014 12:55:02 +0800 Chen Yucong <[email protected]> wrote:
> >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index a8ffe4e..2c35e34 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> > > blk_start_plug(&plug);
> > > while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> > > nr[LRU_INACTIVE_FILE]) {
> > > - unsigned long nr_anon, nr_file, percentage;
> > > - unsigned long nr_scanned;
> > > + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> > > + unsigned long nr_to_scan, nr_scanned, percentage;
> > >
> > > for_each_evictable_lru(lru) {
> > > if (nr[lru]) {
> >
> > The increased stack use is a slight concern - we can be very deep here.
> > I suspect the "percent" locals are more for convenience/clarity, and
> > they could be eliminated (in a separate patch) at some cost of clarity?
> >
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a8ffe4e..2c35e34 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2057,8 +2057,7 @@ out:
> static void shrink_lruvec(struct lruvec *lruvec, struct scan_control
> *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> - unsigned long targets[NR_LRU_LISTS];
> - unsigned long nr_to_scan;
> + unsigned long file_target, anon_target;
>
> >From the above snippet, we can know that the "percent" locals come from
> targets[NR_LRU_LISTS]. So this fix does not increase the stack.

OK. But I expect the stack use could be decreased by using more
complex expressions.

2014-06-19 01:02:07

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

Hello Andrew,

On Wed, Jun 18, 2014 at 05:40:01PM -0700, Andrew Morton wrote:
> On Thu, 19 Jun 2014 08:04:32 +0800 Chen Yucong <[email protected]> wrote:
>
> > On Wed, 2014-06-18 at 15:27 -0700, Andrew Morton wrote:
> > > On Tue, 17 Jun 2014 12:55:02 +0800 Chen Yucong <[email protected]> wrote:
> > >
> > > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > > index a8ffe4e..2c35e34 100644
> > > > --- a/mm/vmscan.c
> > > > +++ b/mm/vmscan.c
> > > > @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> > > > blk_start_plug(&plug);
> > > > while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> > > > nr[LRU_INACTIVE_FILE]) {
> > > > - unsigned long nr_anon, nr_file, percentage;
> > > > - unsigned long nr_scanned;
> > > > + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> > > > + unsigned long nr_to_scan, nr_scanned, percentage;
> > > >
> > > > for_each_evictable_lru(lru) {
> > > > if (nr[lru]) {
> > >
> > > The increased stack use is a slight concern - we can be very deep here.
> > > I suspect the "percent" locals are more for convenience/clarity, and
> > > they could be eliminated (in a separate patch) at some cost of clarity?
> > >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index a8ffe4e..2c35e34 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2057,8 +2057,7 @@ out:
> > static void shrink_lruvec(struct lruvec *lruvec, struct scan_control
> > *sc)
> > {
> > unsigned long nr[NR_LRU_LISTS];
> > - unsigned long targets[NR_LRU_LISTS];
> > - unsigned long nr_to_scan;
> > + unsigned long file_target, anon_target;
> >
> > >From the above snippet, we can know that the "percent" locals come from
> > targets[NR_LRU_LISTS]. So this fix does not increase the stack.
>
> OK. But I expect the stack use could be decreased by using more
> complex expressions.

I didn't look at this patch yet but want to say.

The expression is not easy to follow since several people already
confused/discuss/fixed a bit so I'd like to put more concern to clarity
rather than stack footprint. I'm not saying stack footprint is not
important but I'd like to remain it last resort.
That's why I posted below for clarity.
https://lkml.org/lkml/2014/6/16/750

If we really want to reduce stack, we could do a little bit by below.

My 2 cents

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9b61b9bf81ac..ddae227fd1ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -164,13 +164,15 @@ enum lru_list {
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
+ NR_EVICTABLE_LRU_LISTS = LRU_UNEVICTABLE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};

#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

-#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
+#define for_each_evictable_lru(lru) for (lru = 0; \
+ lru <= NR_EVICTABLE_LRU_LISTS; lru++)

static inline int is_file_lru(enum lru_list lru)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9c74b409681..11f57a017131 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2012,8 +2012,8 @@ out:
*/
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- unsigned long nr[NR_LRU_LISTS];
- unsigned long targets[NR_LRU_LISTS];
+ unsigned long nr[NR_EVICTABLE_LRU_LISTS];
+ unsigned long targets[NR_EVICTABLE_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
--
Kind regards,
Minchan Kim

2014-06-19 20:13:27

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Thu, 19 Jun 2014 10:02:39 +0900 Minchan Kim <[email protected]> wrote:

> > > @@ -2057,8 +2057,7 @@ out:
> > > static void shrink_lruvec(struct lruvec *lruvec, struct scan_control
> > > *sc)
> > > {
> > > unsigned long nr[NR_LRU_LISTS];
> > > - unsigned long targets[NR_LRU_LISTS];
> > > - unsigned long nr_to_scan;
> > > + unsigned long file_target, anon_target;
> > >
> > > >From the above snippet, we can know that the "percent" locals come from
> > > targets[NR_LRU_LISTS]. So this fix does not increase the stack.
> >
> > OK. But I expect the stack use could be decreased by using more
> > complex expressions.
>
> I didn't look at this patch yet but want to say.
>
> The expression is not easy to follow since several people already
> confused/discuss/fixed a bit so I'd like to put more concern to clarity
> rather than stack footprint.

That code is absolutely awful :( It's terribly difficult to work out
what the design is - what the code is actually setting out to achieve.
One is reduced to trying to reverse-engineer the intent from the
implementation and that becomes near impossible when the
implementation has bugs!

Look at this miserable comment:

/*
* For kswapd and memcg, reclaim at least the number of pages
* requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
*/


> For kswapd and memcg, reclaim at least the number of pages
> requested.

*why*?

> Ensure that the anon and file LRUs are scanned
> proportionally what was requested by get_scan_count().

Ungrammatical. Lacks specificity. Fails to explain *why*.

> We stop reclaiming one LRU and reduce the amount scanning
> proportional to the original scan target.

Ungrammatical. Lacks specificity. Fails to explain *why*.


The only way we're going to fix all this up is to stop looking at the
code altogether. Write down the design and the intentions in English.
Review that. Then implement that design in C.

So review and understanding of this code then is a two-stage thing.
First, we review and understand the *design*, as written in English.
Secondly, we check that the code faithfully implements that design.
This second step becomes quite trivial.


That may all sound excessively long-winded and formal, but
shrink_lruvec() of all places needs such treatment. I am completely
fed up with peering at the code trying to work out what on earth people
were thinking when they typed it in :(


So my suggestion is: let's stop fiddling with the code. Someone please
prepare a patch which fully documents the design and let's get down and
review that. Once that patch is complete, let's then start looking at
the implementation.

2014-06-23 10:00:46

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Thu, Jun 19, 2014 at 01:13:22PM -0700, Andrew Morton wrote:
> On Thu, 19 Jun 2014 10:02:39 +0900 Minchan Kim <[email protected]> wrote:
>
> > > > @@ -2057,8 +2057,7 @@ out:
> > > > static void shrink_lruvec(struct lruvec *lruvec, struct scan_control
> > > > *sc)
> > > > {
> > > > unsigned long nr[NR_LRU_LISTS];
> > > > - unsigned long targets[NR_LRU_LISTS];
> > > > - unsigned long nr_to_scan;
> > > > + unsigned long file_target, anon_target;
> > > >
> > > > >From the above snippet, we can know that the "percent" locals come from
> > > > targets[NR_LRU_LISTS]. So this fix does not increase the stack.
> > >
> > > OK. But I expect the stack use could be decreased by using more
> > > complex expressions.
> >
> > I didn't look at this patch yet but want to say.
> >
> > The expression is not easy to follow since several people already
> > confused/discuss/fixed a bit so I'd like to put more concern to clarity
> > rather than stack footprint.
>
> That code is absolutely awful :( It's terribly difficult to work out
> what the design is - what the code is actually setting out to achieve.
> One is reduced to trying to reverse-engineer the intent from the
> implementation and that becomes near impossible when the
> implementation has bugs!
>
> Look at this miserable comment:
>
> /*
> * For kswapd and memcg, reclaim at least the number of pages
> * requested. Ensure that the anon and file LRUs are scanned
> * proportionally what was requested by get_scan_count(). We
> * stop reclaiming one LRU and reduce the amount scanning
> * proportional to the original scan target.
> */
>
>
> > For kswapd and memcg, reclaim at least the number of pages
> > requested.
>
> *why*?
>

At the time of writing the intention was to reduce direct reclaim stall
latency in the global case. Initially the following block was above it

/*
* For global direct reclaim, reclaim only the number of pages
* requested. Less care is taken to scan proportionally as it
* is more important to minimise direct reclaim stall latency
* than it is to properly age the LRU lists.
*/
if (global_reclaim(sc) && !current_is_kswapd())
break;

When that comment was removed then the remaining comment is less clear.


> > Ensure that the anon and file LRUs are scanned
> > proportionally what was requested by get_scan_count().
>
> Ungramattical. Lacks specificity. Fails to explain *why*.
>

In the normal case, file/anon LRUs are scanned at a rate proportional
to the value of vm.swappiness. get_scan_count() calculates the number of
pages to scan from each LRU taking into account additional factors such
as the availability of swap. When the requested number of pages have been
reclaimed we adjust the scan targets to minimise the number of pages scanned
while maintaining the ratio of file/anon pages that are scanned.

--
Mel Gorman
SUSE Labs

2014-06-23 11:02:47

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH] mm/vmscan.c: fix an implementation flaw in proportional scanning

On Tue, Jun 17, 2014 at 12:55:02PM +0800, Chen Yucong wrote:
> Via https://lkml.org/lkml/2013/4/10/897, we can know that the relative design
> idea is to keep
>
> scan_target[anon] : scan_target[file]
> == really_scanned_num[anon] : really_scanned_num[file]
>
> But we can find the following snippet in shrink_lruvec():
>
> if (nr_file > nr_anon) {
> ...
> } else {
> ...
> }
>

This is to preserve the ratio of scanning between the lists once the
reclaim target has been reached. One list scanning stops and the other
continues until the proportional number of pages have been scanned.

> However, the above code fragment broke the design idea. We can assume:
>
> nr[LRU_ACTIVE_FILE] = 30
> nr[LRU_INACTIVE_FILE] = 30
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 40
>
> When the value of (nr_reclaimed < nr_to_reclaim) become false, there are
> the following results:
>
> nr[LRU_ACTIVE_FILE] = 15
> nr[LRU_INACTIVE_FILE] = 15
> nr[LRU_ACTIVE_ANON] = 0
> nr[LRU_INACTIVE_ANON] = 25
> nr_file = 30
> nr_anon = 25
> file_percent = 30 / 60 = 0.5
> anon_percent = 25 / 40 = 0.65
>

The original proportion was

file_percent = 60
anon_percent = 40

We check nr_file > nr_anon based on the remaining scan counts in nr[].
We recheck what proportion the larger LRU should be scanned based on targets[]

> According to the above design idea, we should scan some pages from ANON,
> but in fact we execute the an error code path due to "if (nr_file > nr_anon)".
> In this way, nr[lru] is likely to be a negative number. Luckily,
> "nr[lru] -= min(nr[lru], nr_scanned)" can help us to filter this situation,
> but it has rebelled against our design idea.
>

What problem did you encounter? What is the measurable impact of the
patch? One of the reasons why I have taken so long to look at this is
because this information was missing.

The original series that introduced this proportional reclaim was
related to kswapd scanning excessively and swapping pages due to heavy
writing IO. The overall intent of that series was to prevent kswapd
scanning excessively while preserving the property that it scan
file/anon LRU lists proportional to vm.swappiness.

The primary test case used to measure that was memcachetest with varying
amounts of IO in the background and monitoring the reclaim activity
(https://lwn.net/Articles/551643/). Later postmark, ffsb, dd of a
large file and a test that measured mmap latency during IO was used
(http://lwn.net/Articles/600145/).

In the memcachetest case, it was demonstrated that we no longer swapped
processes just because there was some IO. That test case may still be
useful for demonstrating problems with proportional reclaim but the
"stutter" test that measured mmap latency during IO might be easier. The
ideal test case would be the one you used for testing this patch and
verifying it worked as expected.

The point is that even though flaws were discovered later there was still
data supporting the inclusion of the original patches.

I did not find an example of where I talked about it publicly but at one
point I used the mm_vmscan_lru_shrink_inactive tracepoint to verify that
that lists were being scanned proportionally. At the time I would have used a
fixed version of Documentation/trace/postprocess/trace-vmscan-postprocess.pl.
mmtests has an equivalent script but it does not currently support reporting
the proportion of anon/file pages scanned. It should be easy enough to
generate a quick script that checks the file/anon rate of scanning with
and without your patch applied


> Signed-off-by: Chen Yucong <[email protected]>
> ---
> mm/vmscan.c | 39 ++++++++++++++++++---------------------
> 1 file changed, 18 insertions(+), 21 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a8ffe4e..2c35e34 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2057,8 +2057,7 @@ out:
> static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> - unsigned long targets[NR_LRU_LISTS];
> - unsigned long nr_to_scan;
> + unsigned long file_target, anon_target;
> enum lru_list lru;
> unsigned long nr_reclaimed = 0;
> unsigned long nr_to_reclaim = sc->nr_to_reclaim;
> @@ -2067,8 +2066,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
>
> get_scan_count(lruvec, sc, nr);
>
> - /* Record the original scan target for proportional adjustments later */
> - memcpy(targets, nr, sizeof(nr));
> + file_target = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
> + anon_target = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
>
> /*
> * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
> @@ -2087,8 +2086,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> blk_start_plug(&plug);
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> - unsigned long nr_anon, nr_file, percentage;
> - unsigned long nr_scanned;
> + unsigned long nr_anon, nr_file, file_percent, anon_percent;
> + unsigned long nr_to_scan, nr_scanned, percentage;
>
> for_each_evictable_lru(lru) {
> if (nr[lru]) {
> @@ -2122,16 +2121,19 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> if (!nr_file || !nr_anon)
> break;
>
> - if (nr_file > nr_anon) {
> - unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
> - targets[LRU_ACTIVE_ANON] + 1;
> + file_percent = nr_file * 100 / file_target;
> + anon_percent = nr_anon * 100 / anon_target;
> +
> + if (file_percent > anon_percent) {

About all that can be said here is the code is different but not obviously
better or worse because there is no supporting data for your case.

In the original code, it was assumed that we scanned the LRUs in batches
of SWAP_CLUSTER_MAX until the requested number of pages were reclaimed.
Assuming the scan counts do not reach zero prematurely, the ratio
between nr_file/nr_anon should remain constant. Whether we check the
remaining counts or the percentages should be irrelevant.

> lru = LRU_BASE;
> - percentage = nr_anon * 100 / scan_target;
> + nr_scanned = file_target - nr_file;
> + nr_to_scan = file_target * (100 - anon_percent) / 100;
> + percentage = nr[LRU_FILE] * 100 / nr_file;
> } else {
> - unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
> - targets[LRU_ACTIVE_FILE] + 1;
> lru = LRU_FILE;
> - percentage = nr_file * 100 / scan_target;
> + nr_scanned = anon_target - nr_anon;
> + nr_to_scan = anon_target * (100 - file_percent) / 100;
> + percentage = nr[LRU_BASE] * 100 / nr_anon;
> }
>
> /* Stop scanning the smaller of the LRU */


There is some merit to recording the file_percentage and anon_percentage
in advance, removing the need for the targets[] array and adjust the
inactive/active lists by the scanning targets but the changelog should
include the anon/file scan rates before and after.


> @@ -2143,14 +2145,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> * scan target and the percentage scanning already complete
> */
> lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
> - nr_scanned = targets[lru] - nr[lru];
> - nr[lru] = targets[lru] * (100 - percentage) / 100;
> - nr[lru] -= min(nr[lru], nr_scanned);
> -
> - lru += LRU_ACTIVE;
> - nr_scanned = targets[lru] - nr[lru];
> - nr[lru] = targets[lru] * (100 - percentage) / 100;
> - nr[lru] -= min(nr[lru], nr_scanned);
> + nr_to_scan -= min(nr_to_scan, nr_scanned);
> + nr[lru] = nr_to_scan * percentage / 100;
> + nr[lru + LRU_ACTIVE] = nr_to_scan - nr[lru];
>
> scan_adjusted = true;
> }

--
Mel Gorman
SUSE Labs