2019-04-17 07:49:42

by Zhaoyang Huang

[permalink] [raw]
Subject: [RFC PATCH] mm/workingset : judge file page activity via timestamp

From: Zhaoyang Huang <[email protected]>

This patch introduce timestamp into workingset's entry and judge if the page
is active or inactive via active_file/refault_ratio instead of refault distance.

The original thought is coming from the logs we got from trace_printk in this
patch, we can find about 1/5 of the file pages' refault are under the
scenario[1],which will be counted as inactive as they have a long refault distance
in between access. However, we can also know from the time information that the
page refault quickly as comparing to the average refault time which is calculated
by the number of active file and refault ratio. We want to save these kinds of
pages from evicted earlier as it used to be. The refault ratio is the value
which can reflect lru's average file access frequency and also can be deemed as a
prediction of future.

The patch is tested on an android system and reduce 30% of page faults, while
60% of the pages remain the original status as (refault_distance < active_file)
indicates. Pages status got from ftrace during the test can refer to [2].

[1]
system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99

[2]
WKST_ACT[0]: original--INACTIVE commit--ACTIVE
WKST_ACT[1]: original--ACTIVE commit--ACTIVE
WKST_INACT[0]: original--INACTIVE commit--INACTIVE
WKST_INACT[1]: original--ACTIVE commit--INACTIVE

Signed-off-by: Zhaoyang Huang <[email protected]>
---
include/linux/mmzone.h | 1 +
mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..6f30673 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -240,6 +240,7 @@ struct lruvec {
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults;
+ atomic_long_t refaults_ratio;
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c..66c177b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -160,6 +160,21 @@
MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 20
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#ifndef CONFIG_MEMCG
+#define EVICTION_SECS_POS_SHIFT 12
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
+#endif
/*
* Eviction timestamps need to be able to cover the full range of
* actionable refaults. However, bits are tight in the radix tree
@@ -169,10 +184,54 @@
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+/*
+ * Shrink the timestamp according to its value and store it together
+ * with the shrink size in the entry.
+ */
+static void pack_secs(unsigned long *peviction)
+{
+ unsigned int secs;
+ unsigned long eviction;
+ int order;
+ int secs_shrink_size;
+ struct timespec ts;
+
+ get_monotonic_boottime(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+ order = get_count_order(secs);
+ secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+ ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+ eviction = *peviction;
+ eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+ | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+ eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
+ *peviction = eviction;
+}
+/*
+ * Unpack the second from the entry and restore the value according to the
+ * shrink size.
+ */
+static unsigned int unpack_secs(unsigned long entry)
+{
+ unsigned int secs;
+ int secs_shrink_size;
+
+ secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+ entry >>= EVICTION_SECS_SHRINK_SHIFT;
+ secs = entry & EVICTION_SECS_POS_MASK;
+ secs = secs << secs_shrink_size;
+ return secs;
+}
+#endif
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
{
eviction >>= bucket_order;
+ pack_secs(&eviction);
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, unsigned int *prev_secs)
{
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+ unsigned int secs;

entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
entry >>= MEM_CGROUP_ID_SHIFT;
+ secs = unpack_secs(entry);
+ entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);

*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *prev_secs = secs;
}

/**
@@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
unsigned long refault;
struct pglist_data *pgdat;
int memcgid;
+#ifndef NO_SECS_IN_WORKINGSET
+ unsigned long avg_refault_time;
+ unsigned long refault_time;
+ int tradition;
+ unsigned int prev_secs;
+ unsigned int secs;
+ unsigned long refaults_ratio;
+#endif
+ struct timespec ts;
+ /*
+ convert jiffies to second
+ */
+ get_monotonic_boottime(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;

- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
-
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
@@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
* list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
-
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
- if (refault_distance <= active_file) {
+#ifndef NO_SECS_IN_WORKINGSET
+ refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
+ atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
+ refault_time = secs - prev_secs;
+ avg_refault_time = active_file / refaults_ratio;
+ tradition = !!(refault_distance < active_file);
+ if (refault_time <= avg_refault_time) {
+#else
+ if (refault_distance < active_file) {
+#endif
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
+ rft_ratio %ld rft_time %ld avg_rft_time %ld \
+ refault %ld eviction %ld secs %d pre_secs %d\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs);
+#endif
rcu_read_unlock();
return true;
}
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
+ rft_ratio %ld rft_time %ld avg_rft_time %ld \
+ refault %ld eviction %ld secs %d pre_secs %d\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs);
+#endif
rcu_read_unlock();
return false;
}
@@ -513,7 +612,9 @@ static int __init workingset_init(void)
unsigned int max_order;
int ret;

- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
+ + EVICTION_SECS_POS_SHIFT
+ + EVICTION_SECS_SHRINK_SHIFT));
/*
* Calculate the eviction bucket size to cover the longest
* actionable refault distance, which is currently half of
@@ -521,7 +622,8 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
+ - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
max_order = fls_long(totalram_pages - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
--
1.9.1


2019-04-17 08:00:36

by Zhaoyang Huang

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

add Johannes and answer his previous question.

@Johannes Weiner
Yes. I do agree with you about the original thought of sacrificing
long distance access pages when huge memory demands arise. The problem
is what is the criteria of the distance, which you can find from what
I comment in the patch, that is, some pages have long refault_distance
while having a very short access time in between. I think the latter
one should be take into consideration or as part of the finnal
decision of if the page should be active/inactive.

On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang <[email protected]> wrote:
>
> From: Zhaoyang Huang <[email protected]>
>
> This patch introduce timestamp into workingset's entry and judge if the page
> is active or inactive via active_file/refault_ratio instead of refault distance.
>
> The original thought is coming from the logs we got from trace_printk in this
> patch, we can find about 1/5 of the file pages' refault are under the
> scenario[1],which will be counted as inactive as they have a long refault distance
> in between access. However, we can also know from the time information that the
> page refault quickly as comparing to the average refault time which is calculated
> by the number of active file and refault ratio. We want to save these kinds of
> pages from evicted earlier as it used to be. The refault ratio is the value
> which can reflect lru's average file access frequency and also can be deemed as a
> prediction of future.
>
> The patch is tested on an android system and reduce 30% of page faults, while
> 60% of the pages remain the original status as (refault_distance < active_file)
> indicates. Pages status got from ftrace during the test can refer to [2].
>
> [1]
> system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
> HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99
>
> [2]
> WKST_ACT[0]: original--INACTIVE commit--ACTIVE
> WKST_ACT[1]: original--ACTIVE commit--ACTIVE
> WKST_INACT[0]: original--INACTIVE commit--INACTIVE
> WKST_INACT[1]: original--ACTIVE commit--INACTIVE
>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> ---
> include/linux/mmzone.h | 1 +
> mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 112 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2..6f30673 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -240,6 +240,7 @@ struct lruvec {
> atomic_long_t inactive_age;
> /* Refaults at the time of last reclaim cycle */
> unsigned long refaults;
> + atomic_long_t refaults_ratio;
> #ifdef CONFIG_MEMCG
> struct pglist_data *pgdat;
> #endif
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 40ee02c..66c177b 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -160,6 +160,21 @@
> MEM_CGROUP_ID_SHIFT)
> #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
>
> +#ifdef CONFIG_64BIT
> +#define EVICTION_SECS_POS_SHIFT 20
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#ifndef CONFIG_MEMCG
> +#define EVICTION_SECS_POS_SHIFT 12
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#define EVICTION_SECS_POS_SHIFT 0
> +#define EVICTION_SECS_SHRINK_SHIFT 0
> +#define NO_SECS_IN_WORKINGSET
> +#endif
> +#endif
> /*
> * Eviction timestamps need to be able to cover the full range of
> * actionable refaults. However, bits are tight in the radix tree
> @@ -169,10 +184,54 @@
> * evictions into coarser buckets by shaving off lower timestamp bits.
> */
> static unsigned int bucket_order __read_mostly;
> -
> +#ifdef NO_SECS_IN_WORKINGSET
> +static void pack_secs(unsigned long *peviction) { }
> +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> +#else
> +/*
> + * Shrink the timestamp according to its value and store it together
> + * with the shrink size in the entry.
> + */
> +static void pack_secs(unsigned long *peviction)
> +{
> + unsigned int secs;
> + unsigned long eviction;
> + int order;
> + int secs_shrink_size;
> + struct timespec ts;
> +
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> + order = get_count_order(secs);
> + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
> + ? 0 : (order - EVICTION_SECS_POS_SHIFT);
> +
> + eviction = *peviction;
> + eviction = (eviction << EVICTION_SECS_POS_SHIFT)
> + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
> + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
> + *peviction = eviction;
> +}
> +/*
> + * Unpack the second from the entry and restore the value according to the
> + * shrink size.
> + */
> +static unsigned int unpack_secs(unsigned long entry)
> +{
> + unsigned int secs;
> + int secs_shrink_size;
> +
> + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
> + entry >>= EVICTION_SECS_SHRINK_SHIFT;
> + secs = entry & EVICTION_SECS_POS_MASK;
> + secs = secs << secs_shrink_size;
> + return secs;
> +}
> +#endif
> static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> {
> eviction >>= bucket_order;
> + pack_secs(&eviction);
> eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
> eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
> eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> }
>
> static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> - unsigned long *evictionp)
> + unsigned long *evictionp, unsigned int *prev_secs)
> {
> unsigned long entry = (unsigned long)shadow;
> int memcgid, nid;
> + unsigned int secs;
>
> entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
> nid = entry & ((1UL << NODES_SHIFT) - 1);
> entry >>= NODES_SHIFT;
> memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
> entry >>= MEM_CGROUP_ID_SHIFT;
> + secs = unpack_secs(entry);
> + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);
>
> *memcgidp = memcgid;
> *pgdat = NODE_DATA(nid);
> *evictionp = entry << bucket_order;
> + *prev_secs = secs;
> }
>
> /**
> @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
> unsigned long refault;
> struct pglist_data *pgdat;
> int memcgid;
> +#ifndef NO_SECS_IN_WORKINGSET
> + unsigned long avg_refault_time;
> + unsigned long refault_time;
> + int tradition;
> + unsigned int prev_secs;
> + unsigned int secs;
> + unsigned long refaults_ratio;
> +#endif
> + struct timespec ts;
> + /*
> + convert jiffies to second
> + */
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
>
> - unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
> -
> + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
> rcu_read_lock();
> /*
> * Look up the memcg associated with the stored ID. It might
> @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
> * list is not a problem.
> */
> refault_distance = (refault - eviction) & EVICTION_MASK;
> -
> inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
> -
> - if (refault_distance <= active_file) {
> +#ifndef NO_SECS_IN_WORKINGSET
> + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
> + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
> + refault_time = secs - prev_secs;
> + avg_refault_time = active_file / refaults_ratio;
> + tradition = !!(refault_distance < active_file);
> + if (refault_time <= avg_refault_time) {
> +#else
> + if (refault_distance < active_file) {
> +#endif
> inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return true;
> }
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return false;
> }
> @@ -513,7 +612,9 @@ static int __init workingset_init(void)
> unsigned int max_order;
> int ret;
>
> - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
> + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
> + + EVICTION_SECS_POS_SHIFT
> + + EVICTION_SECS_SHRINK_SHIFT));
> /*
> * Calculate the eviction bucket size to cover the longest
> * actionable refault distance, which is currently half of
> @@ -521,7 +622,8 @@ static int __init workingset_init(void)
> * some more pages at runtime, so keep working with up to
> * double the initial memory by using totalram_pages as-is.
> */
> - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
> + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
> max_order = fls_long(totalram_pages - 1);
> if (max_order > timestamp_bits)
> bucket_order = max_order - timestamp_bits;
> --
> 1.9.1
>

2019-04-17 08:46:27

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

Hi,
I do not see http://lkml.kernel.org/r/[email protected]
discussion reaching a conlusion to change the current workingset
implementation. Therefore is there any reason to post a new version of
the patch? If yes it would be really great to see a short summary about
how this version is different from the previous one and how all the
review feedback has been addressed.

On Wed 17-04-19 15:47:26, Zhaoyang Huang wrote:
> From: Zhaoyang Huang <[email protected]>
>
> This patch introduce timestamp into workingset's entry and judge if the page
> is active or inactive via active_file/refault_ratio instead of refault distance.
>
> The original thought is coming from the logs we got from trace_printk in this
> patch, we can find about 1/5 of the file pages' refault are under the
> scenario[1],which will be counted as inactive as they have a long refault distance
> in between access. However, we can also know from the time information that the
> page refault quickly as comparing to the average refault time which is calculated
> by the number of active file and refault ratio. We want to save these kinds of
> pages from evicted earlier as it used to be. The refault ratio is the value
> which can reflect lru's average file access frequency and also can be deemed as a
> prediction of future.
>
> The patch is tested on an android system and reduce 30% of page faults, while
> 60% of the pages remain the original status as (refault_distance < active_file)
> indicates. Pages status got from ftrace during the test can refer to [2].
>
> [1]
> system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
> HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99
>
> [2]
> WKST_ACT[0]: original--INACTIVE commit--ACTIVE
> WKST_ACT[1]: original--ACTIVE commit--ACTIVE
> WKST_INACT[0]: original--INACTIVE commit--INACTIVE
> WKST_INACT[1]: original--ACTIVE commit--INACTIVE
>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> ---
> include/linux/mmzone.h | 1 +
> mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 112 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2..6f30673 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -240,6 +240,7 @@ struct lruvec {
> atomic_long_t inactive_age;
> /* Refaults at the time of last reclaim cycle */
> unsigned long refaults;
> + atomic_long_t refaults_ratio;
> #ifdef CONFIG_MEMCG
> struct pglist_data *pgdat;
> #endif
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 40ee02c..66c177b 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -160,6 +160,21 @@
> MEM_CGROUP_ID_SHIFT)
> #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
>
> +#ifdef CONFIG_64BIT
> +#define EVICTION_SECS_POS_SHIFT 20
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#ifndef CONFIG_MEMCG
> +#define EVICTION_SECS_POS_SHIFT 12
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#define EVICTION_SECS_POS_SHIFT 0
> +#define EVICTION_SECS_SHRINK_SHIFT 0
> +#define NO_SECS_IN_WORKINGSET
> +#endif
> +#endif
> /*
> * Eviction timestamps need to be able to cover the full range of
> * actionable refaults. However, bits are tight in the radix tree
> @@ -169,10 +184,54 @@
> * evictions into coarser buckets by shaving off lower timestamp bits.
> */
> static unsigned int bucket_order __read_mostly;
> -
> +#ifdef NO_SECS_IN_WORKINGSET
> +static void pack_secs(unsigned long *peviction) { }
> +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> +#else
> +/*
> + * Shrink the timestamp according to its value and store it together
> + * with the shrink size in the entry.
> + */
> +static void pack_secs(unsigned long *peviction)
> +{
> + unsigned int secs;
> + unsigned long eviction;
> + int order;
> + int secs_shrink_size;
> + struct timespec ts;
> +
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> + order = get_count_order(secs);
> + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
> + ? 0 : (order - EVICTION_SECS_POS_SHIFT);
> +
> + eviction = *peviction;
> + eviction = (eviction << EVICTION_SECS_POS_SHIFT)
> + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
> + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
> + *peviction = eviction;
> +}
> +/*
> + * Unpack the second from the entry and restore the value according to the
> + * shrink size.
> + */
> +static unsigned int unpack_secs(unsigned long entry)
> +{
> + unsigned int secs;
> + int secs_shrink_size;
> +
> + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
> + entry >>= EVICTION_SECS_SHRINK_SHIFT;
> + secs = entry & EVICTION_SECS_POS_MASK;
> + secs = secs << secs_shrink_size;
> + return secs;
> +}
> +#endif
> static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> {
> eviction >>= bucket_order;
> + pack_secs(&eviction);
> eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
> eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
> eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> }
>
> static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> - unsigned long *evictionp)
> + unsigned long *evictionp, unsigned int *prev_secs)
> {
> unsigned long entry = (unsigned long)shadow;
> int memcgid, nid;
> + unsigned int secs;
>
> entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
> nid = entry & ((1UL << NODES_SHIFT) - 1);
> entry >>= NODES_SHIFT;
> memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
> entry >>= MEM_CGROUP_ID_SHIFT;
> + secs = unpack_secs(entry);
> + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);
>
> *memcgidp = memcgid;
> *pgdat = NODE_DATA(nid);
> *evictionp = entry << bucket_order;
> + *prev_secs = secs;
> }
>
> /**
> @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
> unsigned long refault;
> struct pglist_data *pgdat;
> int memcgid;
> +#ifndef NO_SECS_IN_WORKINGSET
> + unsigned long avg_refault_time;
> + unsigned long refault_time;
> + int tradition;
> + unsigned int prev_secs;
> + unsigned int secs;
> + unsigned long refaults_ratio;
> +#endif
> + struct timespec ts;
> + /*
> + convert jiffies to second
> + */
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
>
> - unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
> -
> + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
> rcu_read_lock();
> /*
> * Look up the memcg associated with the stored ID. It might
> @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
> * list is not a problem.
> */
> refault_distance = (refault - eviction) & EVICTION_MASK;
> -
> inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
> -
> - if (refault_distance <= active_file) {
> +#ifndef NO_SECS_IN_WORKINGSET
> + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
> + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
> + refault_time = secs - prev_secs;
> + avg_refault_time = active_file / refaults_ratio;
> + tradition = !!(refault_distance < active_file);
> + if (refault_time <= avg_refault_time) {
> +#else
> + if (refault_distance < active_file) {
> +#endif
> inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return true;
> }
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return false;
> }
> @@ -513,7 +612,9 @@ static int __init workingset_init(void)
> unsigned int max_order;
> int ret;
>
> - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
> + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
> + + EVICTION_SECS_POS_SHIFT
> + + EVICTION_SECS_SHRINK_SHIFT));
> /*
> * Calculate the eviction bucket size to cover the longest
> * actionable refault distance, which is currently half of
> @@ -521,7 +622,8 @@ static int __init workingset_init(void)
> * some more pages at runtime, so keep working with up to
> * double the initial memory by using totalram_pages as-is.
> */
> - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
> + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
> max_order = fls_long(totalram_pages - 1);
> if (max_order > timestamp_bits)
> bucket_order = max_order - timestamp_bits;
> --
> 1.9.1

--
Michal Hocko
SUSE Labs

2019-04-17 10:56:51

by Zhaoyang Huang

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

fix one mailbox and update for some information

Comparing to http://lkml.kernel.org/r/[email protected],
this commit fix the packing order error and add trace_printk for
reference debug information.

For johannes's comments, please find bellowing for my feedback.



On Wed, Apr 17, 2019 at 3:59 PM Zhaoyang Huang <[email protected]> wrote:
>
> add Johannes and answer his previous question.
>
> @Johannes Weiner
> Yes. I do agree with you about the original thought of sacrificing
> long distance access pages when huge memory demands arise. The problem
> is what is the criteria of the distance, which you can find from what
> I comment in the patch, that is, some pages have long refault_distance
> while having a very short access time in between. I think the latter
> one should be take into consideration or as part of the finnal
> decision of if the page should be active/inactive.
>
> On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang <[email protected]> wrote:
> >
> > From: Zhaoyang Huang <[email protected]>
> >
> > This patch introduce timestamp into workingset's entry and judge if the page
> > is active or inactive via active_file/refault_ratio instead of refault distance.
> >
> > The original thought is coming from the logs we got from trace_printk in this
> > patch, we can find about 1/5 of the file pages' refault are under the
> > scenario[1],which will be counted as inactive as they have a long refault distance
> > in between access. However, we can also know from the time information that the
> > page refault quickly as comparing to the average refault time which is calculated
> > by the number of active file and refault ratio. We want to save these kinds of
> > pages from evicted earlier as it used to be. The refault ratio is the value
> > which can reflect lru's average file access frequency and also can be deemed as a
> > prediction of future.
> >
> > The patch is tested on an android system and reduce 30% of page faults, while
> > 60% of the pages remain the original status as (refault_distance < active_file)
> > indicates. Pages status got from ftrace during the test can refer to [2].
> >
> > [1]
> > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
> > HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99
> >
> > [2]
> > WKST_ACT[0]: original--INACTIVE commit--ACTIVE
> > WKST_ACT[1]: original--ACTIVE commit--ACTIVE
> > WKST_INACT[0]: original--INACTIVE commit--INACTIVE
> > WKST_INACT[1]: original--ACTIVE commit--INACTIVE
> >
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> > ---
> > include/linux/mmzone.h | 1 +
> > mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----
> > 2 files changed, 112 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 32699b2..6f30673 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -240,6 +240,7 @@ struct lruvec {
> > atomic_long_t inactive_age;
> > /* Refaults at the time of last reclaim cycle */
> > unsigned long refaults;
> > + atomic_long_t refaults_ratio;
> > #ifdef CONFIG_MEMCG
> > struct pglist_data *pgdat;
> > #endif
> > diff --git a/mm/workingset.c b/mm/workingset.c
> > index 40ee02c..66c177b 100644
> > --- a/mm/workingset.c
> > +++ b/mm/workingset.c
> > @@ -160,6 +160,21 @@
> > MEM_CGROUP_ID_SHIFT)
> > #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
> >
> > +#ifdef CONFIG_64BIT
> > +#define EVICTION_SECS_POS_SHIFT 20
> > +#define EVICTION_SECS_SHRINK_SHIFT 4
> > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> > +#else
> > +#ifndef CONFIG_MEMCG
> > +#define EVICTION_SECS_POS_SHIFT 12
> > +#define EVICTION_SECS_SHRINK_SHIFT 4
> > +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> > +#else
> > +#define EVICTION_SECS_POS_SHIFT 0
> > +#define EVICTION_SECS_SHRINK_SHIFT 0
> > +#define NO_SECS_IN_WORKINGSET
> > +#endif
> > +#endif
> > /*
> > * Eviction timestamps need to be able to cover the full range of
> > * actionable refaults. However, bits are tight in the radix tree
> > @@ -169,10 +184,54 @@
> > * evictions into coarser buckets by shaving off lower timestamp bits.
> > */
> > static unsigned int bucket_order __read_mostly;
> > -
> > +#ifdef NO_SECS_IN_WORKINGSET
> > +static void pack_secs(unsigned long *peviction) { }
> > +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> > +#else
> > +/*
> > + * Shrink the timestamp according to its value and store it together
> > + * with the shrink size in the entry.
> > + */
> > +static void pack_secs(unsigned long *peviction)
> > +{
> > + unsigned int secs;
> > + unsigned long eviction;
> > + int order;
> > + int secs_shrink_size;
> > + struct timespec ts;
> > +
> > + get_monotonic_boottime(&ts);
> > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> > + order = get_count_order(secs);
> > + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
> > + ? 0 : (order - EVICTION_SECS_POS_SHIFT);
> > +
> > + eviction = *peviction;
> > + eviction = (eviction << EVICTION_SECS_POS_SHIFT)
> > + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
> > + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
> > + *peviction = eviction;
> > +}
> > +/*
> > + * Unpack the second from the entry and restore the value according to the
> > + * shrink size.
> > + */
> > +static unsigned int unpack_secs(unsigned long entry)
> > +{
> > + unsigned int secs;
> > + int secs_shrink_size;
> > +
> > + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
> > + entry >>= EVICTION_SECS_SHRINK_SHIFT;
> > + secs = entry & EVICTION_SECS_POS_MASK;
> > + secs = secs << secs_shrink_size;
> > + return secs;
> > +}
> > +#endif
> > static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> > {
> > eviction >>= bucket_order;
> > + pack_secs(&eviction);
> > eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
> > eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
> > eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> > @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> > }
> >
> > static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> > - unsigned long *evictionp)
> > + unsigned long *evictionp, unsigned int *prev_secs)
> > {
> > unsigned long entry = (unsigned long)shadow;
> > int memcgid, nid;
> > + unsigned int secs;
> >
> > entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
> > nid = entry & ((1UL << NODES_SHIFT) - 1);
> > entry >>= NODES_SHIFT;
> > memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
> > entry >>= MEM_CGROUP_ID_SHIFT;
> > + secs = unpack_secs(entry);
> > + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);
> >
> > *memcgidp = memcgid;
> > *pgdat = NODE_DATA(nid);
> > *evictionp = entry << bucket_order;
> > + *prev_secs = secs;
> > }
> >
> > /**
> > @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
> > unsigned long refault;
> > struct pglist_data *pgdat;
> > int memcgid;
> > +#ifndef NO_SECS_IN_WORKINGSET
> > + unsigned long avg_refault_time;
> > + unsigned long refault_time;
> > + int tradition;
> > + unsigned int prev_secs;
> > + unsigned int secs;
> > + unsigned long refaults_ratio;
> > +#endif
> > + struct timespec ts;
> > + /*
> > + convert jiffies to second
> > + */
> > + get_monotonic_boottime(&ts);
> > + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> >
> > - unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
> > -
> > + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
> > rcu_read_lock();
> > /*
> > * Look up the memcg associated with the stored ID. It might
> > @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
> > * list is not a problem.
> > */
> > refault_distance = (refault - eviction) & EVICTION_MASK;
> > -
> > inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
> > -
> > - if (refault_distance <= active_file) {
> > +#ifndef NO_SECS_IN_WORKINGSET
> > + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
> > + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
> > + refault_time = secs - prev_secs;
> > + avg_refault_time = active_file / refaults_ratio;
> > + tradition = !!(refault_distance < active_file);
> > + if (refault_time <= avg_refault_time) {
> > +#else
> > + if (refault_distance < active_file) {
> > +#endif
> > inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
> > +#ifndef NO_SECS_IN_WORKINGSET
> > + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
> > + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> > + refault %ld eviction %ld secs %d pre_secs %d\n",
> > + tradition, refault_distance, active_file,
> > + refaults_ratio, refault_time, avg_refault_time,
> > + refault, eviction, secs, prev_secs);
> > +#endif
> > rcu_read_unlock();
> > return true;
> > }
> > +#ifndef NO_SECS_IN_WORKINGSET
> > + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
> > + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> > + refault %ld eviction %ld secs %d pre_secs %d\n",
> > + tradition, refault_distance, active_file,
> > + refaults_ratio, refault_time, avg_refault_time,
> > + refault, eviction, secs, prev_secs);
> > +#endif
> > rcu_read_unlock();
> > return false;
> > }
> > @@ -513,7 +612,9 @@ static int __init workingset_init(void)
> > unsigned int max_order;
> > int ret;
> >
> > - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
> > + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
> > + + EVICTION_SECS_POS_SHIFT
> > + + EVICTION_SECS_SHRINK_SHIFT));
> > /*
> > * Calculate the eviction bucket size to cover the longest
> > * actionable refault distance, which is currently half of
> > @@ -521,7 +622,8 @@ static int __init workingset_init(void)
> > * some more pages at runtime, so keep working with up to
> > * double the initial memory by using totalram_pages as-is.
> > */
> > - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> > + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
> > + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
> > max_order = fls_long(totalram_pages - 1);
> > if (max_order > timestamp_bits)
> > bucket_order = max_order - timestamp_bits;
> > --
> > 1.9.1
> >

2019-04-17 11:07:27

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

On Wed 17-04-19 18:55:15, Zhaoyang Huang wrote:
> fix one mailbox and update for some information
>
> Comparing to http://lkml.kernel.org/r/[email protected],
> this commit fix the packing order error and add trace_printk for
> reference debug information.
>
> For johannes's comments, please find bellowing for my feedback.

OK, this suggests there is no strong reason to poset a new version of
the patch then. Please do not fragment discussion and continue
discussing in the original email thread until there is some conclusion
reached.

Thanks!
--
Michal Hocko
SUSE Labs

2019-04-17 11:39:00

by Zhaoyang Huang

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

sorry for the confusion. What I mean is the basic idea doesn't change
as replacing the refault criteria from refault_distance to timestamp.
But the detailed implementation changed a lot, including fix bugs,
update the way of packing the timestamp, 32bit/64bit differentiation
etc. So it makes sense for starting a new context.

On Wed, Apr 17, 2019 at 7:06 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 17-04-19 18:55:15, Zhaoyang Huang wrote:
> > fix one mailbox and update for some information
> >
> > Comparing to http://lkml.kernel.org/r/[email protected],
> > this commit fix the packing order error and add trace_printk for
> > reference debug information.
> >
> > For johannes's comments, please find bellowing for my feedback.
>
> OK, this suggests there is no strong reason to poset a new version of
> the patch then. Please do not fragment discussion and continue
> discussing in the original email thread until there is some conclusion
> reached.
>
> Thanks!
> --
> Michal Hocko
> SUSE Labs

2019-04-17 11:48:50

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

On Wed 17-04-19 19:36:21, Zhaoyang Huang wrote:
> sorry for the confusion. What I mean is the basic idea doesn't change
> as replacing the refault criteria from refault_distance to timestamp.
> But the detailed implementation changed a lot, including fix bugs,
> update the way of packing the timestamp, 32bit/64bit differentiation
> etc. So it makes sense for starting a new context.

Not really. My take away from the previous discussion is that Johannes
has questioned the timestamping approach itself. I wasn't following very
closely so I might be wrong here but if that is really the case then it
doesn't make much sense to improve the implementation if there is no
consensus on the approach itself.

--
Michal Hocko
SUSE Labs

2019-04-17 12:28:04

by Zhaoyang Huang

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

repost the feedback by under Johannes's comment
When something like a higher-order allocation drops a large number of
file pages, it's *intentional* that the pages that were evicted before
them become less valuable and less likely to be activated on refault.
There is a finite amount of in-memory LRU space and the pages that
have been evicted the most recently have precedence because they have
the highest proven access frequency.
[HZY]: Yes. I do agree with you about the original thought of
sacrificing long distance access pages when huge memory demands arise.
The problem is what is the criteria of selecting the page, which you
can find from what I comment in the patch, that is, some pages have
long refault_distance while having a very short access time in
between.

Of course, when a large amount of the cache that was pushed out in
between is not re-used again, and don't claim their space in memory,
it would be great if we could then activate the older pages that *are*
re-used again in their stead.But that would require us being able to
look into the future. When an old page refaults, we don't know if a
younger page is still going to refault with a shorter refault distance
or not. If it won't, then we were right to activate it. If it will
refault, then we put something on the active list whose reuse
frequency is too low to be able to fit into memory, and we thrash the
hottest pages in the system.
[HZY]: We do NOT use the absolute timestamp when page refaulting to
indicate young or old of the page and thus to decide the position of
LRU. The criteria which i use is to comparing the "time duration of
the page's out of cache" and "the active files shrinking time by
dividing average refault ratio". I inherite the concept of deeming
ACTIVE file as deficit of INACTIVE files, but use time to avoid the
scenario as suggested in patch's [1].

As Matthew says, you are fairly randomly making refault activations
more aggressive (especially with that timestamp unpacking bug), and
while that expectedly boosts workload transition / startup, it comes
at the cost of disrupting stable states because you can flood a very
active in-ram workingset with completely cold cache pages simply
because they refault uniformly wrt each other.
[HZY]: I analysis the log got from trace_printk, what we activate have
proven record of long refault distance but very short refault time.

On Wed, Apr 17, 2019 at 7:46 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 17-04-19 19:36:21, Zhaoyang Huang wrote:
> > sorry for the confusion. What I mean is the basic idea doesn't change
> > as replacing the refault criteria from refault_distance to timestamp.
> > But the detailed implementation changed a lot, including fix bugs,
> > update the way of packing the timestamp, 32bit/64bit differentiation
> > etc. So it makes sense for starting a new context.
>
> Not really. My take away from the previous discussion is that Johannes
> has questioned the timestamping approach itself. I wasn't following very
> closely so I might be wrong here but if that is really the case then it
> doesn't make much sense to improve the implementation if there is no
> consensus on the approach itself.
>
> --
> Michal Hocko
> SUSE Labs

2019-04-17 13:01:12

by Michal Hocko

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

On Wed 17-04-19 20:26:22, Zhaoyang Huang wrote:
> repost the feedback by under Johannes's comment

Please follow up in the original email thread. Fragmenting the
discussion is exactly what I wanted...
--
Michal Hocko
SUSE Labs

2019-04-17 13:38:51

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote:
[quoting Johannes here]
> As Matthew says, you are fairly randomly making refault activations
> more aggressive (especially with that timestamp unpacking bug), and
> while that expectedly boosts workload transition / startup, it comes
> at the cost of disrupting stable states because you can flood a very
> active in-ram workingset with completely cold cache pages simply
> because they refault uniformly wrt each other.
> [HZY]: I analysis the log got from trace_printk, what we activate have
> proven record of long refault distance but very short refault time.

You haven't addressed my point, which is that you were only testing
workloads for which your changed algorithm would improve the results.
What you haven't done is shown how other workloads would be negatively
affected.

Once you do that, we can make a decision about whether to improve your
workload by X% and penalise that other workload by Y%.

2019-04-23 11:46:23

by Zhaoyang Huang

[permalink] [raw]
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

rebase the commit to latest mainline and update the code.
@Matthew, with regarding to your comment, I would like to say the
algorithm doesn't change at all. I do NOT judge the page's activity
via an absolute time value, but still the refault distance. What I
want to fix is the scenario which drop lots of file pages on this lru
that leading to a big refault_distance(inactive_age) and inactivate
the page. I haven't found regression of the commit yet. Could you
please suggest me more test cases? Thank you!
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741..ca4ced6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -242,6 +242,7 @@ struct lruvec {
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults;
+ atomic_long_t refaults_ratio;
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67..95683c1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -171,6 +171,15 @@
1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 19
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
/*
* Eviction timestamps need to be able to cover the full range of
* actionable refaults. However, bits are tight in the xarray
@@ -180,12 +189,48 @@
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+static void pack_secs(unsigned long *peviction)
+{
+ unsigned int secs;
+ unsigned long eviction;
+ int order;
+ int secs_shrink_size;
+ struct timespec64 ts;
+
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+ order = get_count_order(secs);
+ secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+ ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+ eviction = *peviction;
+ eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+ | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+ eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) |
(secs_shrink_size & 0xf);
+ *peviction = eviction;
+}
+static unsigned int unpack_secs(unsigned long entry)
+{
+ unsigned int secs;
+ int secs_shrink_size;
+
+ secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+ entry >>= EVICTION_SECS_SHRINK_SHIFT;
+ secs = entry & EVICTION_SECS_POS_MASK;
+ secs = secs << secs_shrink_size;
+ return secs;
+}
+#endif
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
eviction >>= bucket_order;
eviction &= EVICTION_MASK;
+ pack_secs(&eviction);
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << 1) | workingset;
@@ -194,11 +239,12 @@ static void *pack_shadow(int memcgid, pg_data_t
*pgdat, unsigned long eviction,
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp, bool *workingsetp)
+ unsigned long *evictionp, bool *workingsetp, unsigned int *prev_secs)
{
unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
bool workingset;
+ unsigned int secs;

workingset = entry & 1;
entry >>= 1;
@@ -206,11 +252,14 @@ static void unpack_shadow(void *shadow, int
*memcgidp, pg_data_t **pgdat,
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
entry >>= MEM_CGROUP_ID_SHIFT;
+ secs = unpack_secs(entry);
+ entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);

*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
*workingsetp = workingset;
+ *prev_secs = secs;
}

/**
@@ -257,8 +306,22 @@ void workingset_refault(struct page *page, void *shadow)
unsigned long refault;
bool workingset;
int memcgid;
+#ifndef NO_SECS_IN_WORKINGSET
+ unsigned long avg_refault_time;
+ unsigned long refaults_ratio;
+ unsigned long refault_time;
+ int tradition;
+ unsigned int prev_secs;
+ unsigned int secs;
+#endif
+ struct timespec64 ts;
+ /*
+ convert jiffies to second
+ */
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;

- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset, &prev_secs);

rcu_read_lock();
/*
@@ -303,23 +366,58 @@ void workingset_refault(struct page *page, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;

inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
+#ifndef NO_SECS_IN_WORKINGSET
+ refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
+ atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
+ refault_time = secs - prev_secs;
+ avg_refault_time = active_file / refaults_ratio;
+ tradition = !!(refault_distance < active_file);
/*
- * Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * What we are trying to solve here is
+ * 1. extremely fast refault as refault_time == 0.
+ * 2. quick file drop scenario, which has a big refault_distance but
+ * small refault_time comparing with the past refault ratio, which
+ * will be deemed as inactive in previous implementation.
*/
- if (refault_distance > active_file)
+ if (refault_time && (((refault_time < avg_refault_time)
+ && (avg_refault_time < 2 * refault_time))
+ || (refault_time >= avg_refault_time))) {
+ trace_printk("WKST_INACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
goto out;
+ }
+ else {
+#else
+ if (refault_distance < active_file) {
+#endif

- SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */

- /* Page was active prior to eviction */
- if (workingset) {
- SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ }
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_ACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
+#endif
}
out:
rcu_read_unlock();
@@ -539,7 +637,9 @@ static int __init workingset_init(void)
unsigned int max_order;
int ret;

- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
+ + EVICTION_SECS_POS_SHIFT
+ + EVICTION_SECS_SHRINK_SHIFT));
/*
* Calculate the eviction bucket size to cover the longest
* actionable refault distance, which is currently half of
@@ -547,7 +647,9 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
+ - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
+
max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;

On Wed, Apr 17, 2019 at 9:37 PM Matthew Wilcox <[email protected]> wrote:
>
> On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote:
> [quoting Johannes here]
> > As Matthew says, you are fairly randomly making refault activations
> > more aggressive (especially with that timestamp unpacking bug), and
> > while that expectedly boosts workload transition / startup, it comes
> > at the cost of disrupting stable states because you can flood a very
> > active in-ram workingset with completely cold cache pages simply
> > because they refault uniformly wrt each other.
> > [HZY]: I analysis the log got from trace_printk, what we activate have
> > proven record of long refault distance but very short refault time.
>
> You haven't addressed my point, which is that you were only testing
> workloads for which your changed algorithm would improve the results.
> What you haven't done is shown how other workloads would be negatively
> affected.
>
> Once you do that, we can make a decision about whether to improve your
> workload by X% and penalise that other workload by Y%.