When usage exceeds min, the propagated min usage should be min rather than 0.
Apply the same to low.
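For illustration (hypothetical numbers): with c->min = 100MB and
usage = 120MB, the current code propagates 0 into the parent's
children_min_usage, while with this patch the protected amount is
capped at min:

	/* before: protection vanishes once usage exceeds min */
	if (usage <= c->min)
		protected = usage;
	else
		protected = 0;		/* 120MB > 100MB: 0 propagated */

	/* after: the protected amount is capped at min */
	protected = min(usage, c->min);	/* 100MB propagated */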
Signed-off-by: Xunlei Pang <[email protected]>
---
mm/page_counter.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f6..75d53f15f040 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
return;
if (c->min || atomic_long_read(&c->min_usage)) {
- if (usage <= c->min)
- protected = usage;
- else
- protected = 0;
-
+ protected = min(usage, c->min);
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -35,11 +31,7 @@ static void propagate_protected_usage(struct page_counter *c,
}
if (c->low || atomic_long_read(&c->low_usage)) {
- if (usage <= c->low)
- protected = usage;
- else
- protected = 0;
-
+ protected = min(usage, c->low);
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
--
2.13.5 (Apple Git-94)
Cgroup memory can be overcommitted, and this will become
even more common in the future.

Let's enable kswapd to reclaim low-protected memory under
memory pressure, to mitigate the global direct reclaim
pressure which can cause jitter in the response time of
latency-sensitive groups.
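For reference, this retry relies on the existing MEMCG_PROT_LOW
handling in shrink_node(), which roughly looks like the following
(a paraphrased sketch, not part of this diff):

	switch (mem_cgroup_protected(root, memcg)) {
	case MEMCG_PROT_MIN:
		/* hard protection: never reclaim */
		continue;
	case MEMCG_PROT_LOW:
		/* best-effort: skipped on the first pass, reclaimable
		 * once memcg_low_reclaim is set by the retry below */
		if (!sc->memcg_low_reclaim) {
			sc->memcg_low_skipped = 1;
			continue;
		}
		memcg_memory_event(memcg, MEMCG_LOW);
		break;
	case MEMCG_PROT_NONE:
		break;
	}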
Signed-off-by: Xunlei Pang <[email protected]>
---
mm/vmscan.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62ac0c488624..3d412eb91f73 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3531,6 +3531,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
+retry:
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
@@ -3622,6 +3623,13 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
sc.priority--;
} while (sc.priority >= 1);
+ if (!sc.nr_reclaimed && sc.memcg_low_skipped) {
+ sc.priority = DEF_PRIORITY;
+ sc.memcg_low_reclaim = 1;
+ sc.memcg_low_skipped = 0;
+ goto retry;
+ }
+
if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;
--
2.13.5 (Apple Git-94)
When a memcg is reclaimed after its usage exceeds min, some
usage below min may also be reclaimed in the current
implementation; the amount is considerably large during kswapd
reclaim according to my ftrace results.

This patch calculates the part of usage above the hard
protection limit, and allows only that part to be reclaimed.
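As a made-up example: with memory.min of 409600000 bytes and usage
24 pages above it, only those 24 pages should be eligible, so the
reclaim target in shrink_node_memcg() is clamped to the excess
(sketch of the change below):

	nr_to_reclaim = sc->nr_to_reclaim;
	if (sc->min_excess)	/* e.g. 24 pages over memory.min */
		nr_to_reclaim = min(nr_to_reclaim, sc->min_excess);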
Signed-off-by: Xunlei Pang <[email protected]>
---
include/linux/memcontrol.h | 7 +++++--
mm/memcontrol.c | 9 +++++++--
mm/vmscan.c | 17 +++++++++++++++--
3 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7ab2120155a4..637ef975792f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -334,7 +334,8 @@ static inline bool mem_cgroup_disabled(void)
}
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg);
+ struct mem_cgroup *memcg,
+ unsigned long *min_excess);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -818,7 +819,9 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
}
static inline enum mem_cgroup_protection mem_cgroup_protected(
- struct mem_cgroup *root, struct mem_cgroup *memcg)
+ struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
+ unsigned long *min_excess)
{
return MEMCG_PROT_NONE;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e1469b80cb7..ca96f68e07a0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5694,6 +5694,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
* mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
+ * @min_excess: store the number of pages exceeding hard protection
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
@@ -5761,7 +5762,8 @@ struct cgroup_subsys memory_cgrp_subsys = {
* as memory.low is a best-effort mechanism.
*/
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg,
+ unsigned long *min_excess)
{
struct mem_cgroup *parent;
unsigned long emin, parent_emin;
@@ -5827,8 +5829,11 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
return MEMCG_PROT_MIN;
else if (usage <= elow)
return MEMCG_PROT_LOW;
- else
+ else {
+ if (emin)
+ *min_excess = usage - emin;
return MEMCG_PROT_NONE;
+ }
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3d412eb91f73..e4fa7a2a63d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,6 +66,9 @@ struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
+ /* How many pages hard protection allows */
+ unsigned long min_excess;
+
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
@@ -2503,10 +2506,14 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
- unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ unsigned long nr_to_reclaim;
struct blk_plug plug;
bool scan_adjusted;
+ nr_to_reclaim = sc->nr_to_reclaim;
+ if (sc->min_excess)
+ nr_to_reclaim = min(nr_to_reclaim, sc->min_excess);
+
get_scan_count(lruvec, memcg, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
@@ -2544,6 +2551,10 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
cond_resched();
+ /* Abort proportional reclaim when hard protection applies */
+ if (sc->min_excess && nr_reclaimed >= sc->min_excess)
+ break;
+
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
@@ -2725,8 +2736,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
+ unsigned long excess = 0;
- switch (mem_cgroup_protected(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg, &excess)) {
case MEMCG_PROT_MIN:
/*
* Hard protection.
@@ -2752,6 +2764,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
+ sc->min_excess = excess;
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
--
2.13.5 (Apple Git-94)
On Mon 03-12-18 16:01:17, Xunlei Pang wrote:
> When usage exceeds min, the propagated min usage should be min rather than 0.
> Apply the same to low.
Why? What is the actual problem?
> Signed-off-by: Xunlei Pang <[email protected]>
> ---
> mm/page_counter.c | 12 ++----------
> 1 file changed, 2 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index de31470655f6..75d53f15f040 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
> return;
>
> if (c->min || atomic_long_read(&c->min_usage)) {
> - if (usage <= c->min)
> - protected = usage;
> - else
> - protected = 0;
> -
> + protected = min(usage, c->min);
> old_protected = atomic_long_xchg(&c->min_usage, protected);
> delta = protected - old_protected;
> if (delta)
> @@ -35,11 +31,7 @@ static void propagate_protected_usage(struct page_counter *c,
> }
>
> if (c->low || atomic_long_read(&c->low_usage)) {
> - if (usage <= c->low)
> - protected = usage;
> - else
> - protected = 0;
> -
> + protected = min(usage, c->low);
> old_protected = atomic_long_xchg(&c->low_usage, protected);
> delta = protected - old_protected;
> if (delta)
> --
> 2.13.5 (Apple Git-94)
>
--
Michal Hocko
SUSE Labs
On Mon 03-12-18 16:01:19, Xunlei Pang wrote:
> When a memcg is reclaimed after its usage exceeds min, some
> usage below min may also be reclaimed in the current
> implementation; the amount is considerably large during kswapd
> reclaim according to my ftrace results.
And here again: please describe the setup and the behavior.
--
Michal Hocko
SUSE Labs
On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
> Cgroup memory can be overcommitted, and this will become
> even more common in the future.
>
> Let's enable kswapd to reclaim low-protected memory under
> memory pressure, to mitigate the global direct reclaim
> pressure which can cause jitter in the response time of
> latency-sensitive groups.
Please be more descriptive about the problem you are trying to handle
here. I haven't actually read the patch but let me emphasise that the
low limit protection is an important isolation tool, and allowing
kswapd to reclaim protected memcgs is going to break the semantics as
they have been introduced and designed.
>
> Signed-off-by: Xunlei Pang <[email protected]>
> ---
> mm/vmscan.c | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 62ac0c488624..3d412eb91f73 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3531,6 +3531,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
>
> count_vm_event(PAGEOUTRUN);
>
> +retry:
> do {
> unsigned long nr_reclaimed = sc.nr_reclaimed;
> bool raise_priority = true;
> @@ -3622,6 +3623,13 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
> sc.priority--;
> } while (sc.priority >= 1);
>
> + if (!sc.nr_reclaimed && sc.memcg_low_skipped) {
> + sc.priority = DEF_PRIORITY;
> + sc.memcg_low_reclaim = 1;
> + sc.memcg_low_skipped = 0;
> + goto retry;
> + }
> +
> if (!sc.nr_reclaimed)
> pgdat->kswapd_failures++;
>
> --
> 2.13.5 (Apple Git-94)
>
--
Michal Hocko
SUSE Labs
On 2018/12/3 PM 7:54, Michal Hocko wrote:
> On Mon 03-12-18 16:01:17, Xunlei Pang wrote:
>> When usage exceeds min, the propagated min usage should be min rather than 0.
>> Apply the same to low.
>
> Why? What is the actual problem?
children_min_usage tracks the total children usage under min;
it's natural that min should be added into children_min_usage
when usage is above min. I can't imagine why 0 is added, is
there special history I missed?
See mem_cgroup_protected(): when usage exceeds min, emin is
calculated as "parent_emin*min/children_min_usage".
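For context, the emin scaling in mem_cgroup_protected() looks
roughly like this (paraphrased from the current code, not verbatim):

	parent_emin = READ_ONCE(parent->memory.emin);
	emin = min(emin, parent_emin);
	if (emin && parent_emin) {
		unsigned long min_usage, siblings_min_usage;

		min_usage = min(usage, memcg->memory.min);
		siblings_min_usage =
			atomic_long_read(&parent->memory.children_min_usage);

		if (min_usage && siblings_min_usage)
			emin = min(emin, parent_emin * min_usage /
					 siblings_min_usage);
	}

A sibling whose usage exceeds its min currently contributes 0 to
children_min_usage while its own min_usage stays at min, so the
min_usage/siblings_min_usage ratio can get skewed.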
>
>> Signed-off-by: Xunlei Pang <[email protected]>
>> ---
>> mm/page_counter.c | 12 ++----------
>> 1 file changed, 2 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/page_counter.c b/mm/page_counter.c
>> index de31470655f6..75d53f15f040 100644
>> --- a/mm/page_counter.c
>> +++ b/mm/page_counter.c
>> @@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
>> return;
>>
>> if (c->min || atomic_long_read(&c->min_usage)) {
>> - if (usage <= c->min)
>> - protected = usage;
>> - else
>> - protected = 0;
>> -
>> + protected = min(usage, c->min);
>> old_protected = atomic_long_xchg(&c->min_usage, protected);
>> delta = protected - old_protected;
>> if (delta)
>> @@ -35,11 +31,7 @@ static void propagate_protected_usage(struct page_counter *c,
>> }
>>
>> if (c->low || atomic_long_read(&c->low_usage)) {
>> - if (usage <= c->low)
>> - protected = usage;
>> - else
>> - protected = 0;
>> -
>> + protected = min(usage, c->low);
>> old_protected = atomic_long_xchg(&c->low_usage, protected);
>> delta = protected - old_protected;
>> if (delta)
>> --
>> 2.13.5 (Apple Git-94)
>>
>
On 2018/12/3 PM 7:56, Michal Hocko wrote:
> On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
>> Cgroup memory can be overcommitted, and this will become
>> even more common in the future.
>>
>> Let's enable kswapd to reclaim low-protected memory under
>> memory pressure, to mitigate the global direct reclaim
>> pressure which can cause jitter in the response time of
>> latency-sensitive groups.
>
> Please be more descriptive about the problem you are trying to handle
> here. I haven't actually read the patch but let me emphasise that the
> low limit protection is an important isolation tool, and allowing
> kswapd to reclaim protected memcgs is going to break the semantics as
> they have been introduced and designed.
We have two types of memcgs: online groups (important business)
and offline groups (unimportant business). Online groups are
all configured with MAX low protection, while offline groups
are not protected at all (with the default low of 0).

When offline groups are overcommitted, global memory pressure
rises. This causes memory allocations from online groups to
constantly fall into slow global direct reclaim in order to
reclaim the online groups' page caches, as kswapd is not able
to reclaim low-protected memory. low is not a hard limit; it's
reasonable for it to be reclaimed by kswapd when there's no
other reclaimable memory.
>
>>
>> Signed-off-by: Xunlei Pang <[email protected]>
>> ---
>> mm/vmscan.c | 8 ++++++++
>> 1 file changed, 8 insertions(+)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 62ac0c488624..3d412eb91f73 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -3531,6 +3531,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
>>
>> count_vm_event(PAGEOUTRUN);
>>
>> +retry:
>> do {
>> unsigned long nr_reclaimed = sc.nr_reclaimed;
>> bool raise_priority = true;
>> @@ -3622,6 +3623,13 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
>> sc.priority--;
>> } while (sc.priority >= 1);
>>
>> + if (!sc.nr_reclaimed && sc.memcg_low_skipped) {
>> + sc.priority = DEF_PRIORITY;
>> + sc.memcg_low_reclaim = 1;
>> + sc.memcg_low_skipped = 0;
>> + goto retry;
>> + }
>> +
>> if (!sc.nr_reclaimed)
>> pgdat->kswapd_failures++;
>>
>> --
>> 2.13.5 (Apple Git-94)
>>
>
On Mon 03-12-18 23:20:31, Xunlei Pang wrote:
> On 2018/12/3 PM 7:56, Michal Hocko wrote:
> > On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
> >> Cgroup memory can be overcommitted, and this will become
> >> even more common in the future.
> >>
> >> Let's enable kswapd to reclaim low-protected memory under
> >> memory pressure, to mitigate the global direct reclaim
> >> pressure which can cause jitter in the response time of
> >> latency-sensitive groups.
> >
> > Please be more descriptive about the problem you are trying to handle
> > here. I haven't actually read the patch but let me emphasise that the
> > low limit protection is an important isolation tool, and allowing
> > kswapd to reclaim protected memcgs is going to break the semantics as
> > they have been introduced and designed.
>
> We have two types of memcgs: online groups (important business)
> and offline groups (unimportant business). Online groups are
> all configured with MAX low protection, while offline groups
> are not protected at all (with the default low of 0).
>
> When offline groups are overcommitted, global memory pressure
> rises. This causes memory allocations from online groups to
> constantly fall into slow global direct reclaim in order to
> reclaim the online groups' page caches, as kswapd is not able
> to reclaim low-protected memory. low is not a hard limit; it's
> reasonable for it to be reclaimed by kswapd when there's no
> other reclaimable memory.
I am sorry, I still do not follow. What role do offline cgroups play?
Those are certainly not low mem protected because mem_cgroup_css_offline
will reset them to 0.
--
Michal Hocko
SUSE Labs
On Mon, Dec 03, 2018 at 04:01:17PM +0800, Xunlei Pang wrote:
> When usage exceeds min, the propagated min usage should be min rather than 0.
> Apply the same to low.
>
> Signed-off-by: Xunlei Pang <[email protected]>
> ---
> mm/page_counter.c | 12 ++----------
> 1 file changed, 2 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index de31470655f6..75d53f15f040 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
> return;
>
> if (c->min || atomic_long_read(&c->min_usage)) {
> - if (usage <= c->min)
> - protected = usage;
> - else
> - protected = 0;
> -
> + protected = min(usage, c->min);
This change makes sense in combination with patch 3, but not as a
standalone "fix". It's not a bug; it's a required thing unless you
start scanning proportionally to the memory.low/min excess.

Please reflect this in the commit message. Or, even better, merge it
into patch 3.

Also, please make sure that the cgroup kselftests pass after your
changes.

Thanks!
On 2018/12/4 AM 1:22, Michal Hocko wrote:
> On Mon 03-12-18 23:20:31, Xunlei Pang wrote:
>> On 2018/12/3 PM 7:56, Michal Hocko wrote:
>>> On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
>>>> Cgroup memory can be overcommitted, and this will become
>>>> even more common in the future.
>>>>
>>>> Let's enable kswapd to reclaim low-protected memory under
>>>> memory pressure, to mitigate the global direct reclaim
>>>> pressure which can cause jitter in the response time of
>>>> latency-sensitive groups.
>>>
>>> Please be more descriptive about the problem you are trying to handle
>>> here. I haven't actually read the patch but let me emphasise that the
>>> low limit protection is an important isolation tool, and allowing
>>> kswapd to reclaim protected memcgs is going to break the semantics as
>>> they have been introduced and designed.
>>
>> We have two types of memcgs: online groups (important business)
>> and offline groups (unimportant business). Online groups are
>> all configured with MAX low protection, while offline groups
>> are not protected at all (with the default low of 0).
>>
>> When offline groups are overcommitted, global memory pressure
>> rises. This causes memory allocations from online groups to
>> constantly fall into slow global direct reclaim in order to
>> reclaim the online groups' page caches, as kswapd is not able
>> to reclaim low-protected memory. low is not a hard limit; it's
>> reasonable for it to be reclaimed by kswapd when there's no
>> other reclaimable memory.
>
> I am sorry, I still do not follow. What role do offline cgroups play?
> Those are certainly not low mem protected because mem_cgroup_css_offline
> will reset them to 0.
>
Oh, I meant "offline groups" to be "offline-business groups"; the
memcgs referred to here are all in the "online" state from the
kernel's perspective.
On 2018/12/3 PM 7:57, Michal Hocko wrote:
> On Mon 03-12-18 16:01:19, Xunlei Pang wrote:
>> When a memcg is reclaimed after its usage exceeds min, some
>> usage below min may also be reclaimed in the current
>> implementation; the amount is considerably large during kswapd
>> reclaim according to my ftrace results.
>
> And here again: please describe the setup and the behavior.
>
Step 1:
mkdir -p /sys/fs/cgroup/memory/online
cd /sys/fs/cgroup/memory/online
echo 512M > memory.max
echo 409600000 > memory.min
echo $$ > tasks
dd if=/dev/sda of=/dev/null &
while true; do sleep 1; cat memory.current; cat memory.min; done
Step 2:
Create global memory pressure by allocating anonymous and cached
pages to constantly trigger kswapd: dd if=/dev/sdb of=/dev/null
Step 3:
Observe the "online" group: usage just hundreds of KiB over
memory.min can cause tens of MiB to be reclaimed by kswapd.
Here is one of the test results I got:
cat memory.current; cat memory.min; echo;
409485312 // current
409600000 // min
385052672 // see, current got over-reclaimed by ~23MB
409600000 // min
Its corresponding ftrace output I monitored:
kswapd_0-281 [000] .... 304.706632: shrink_node_memcg: min_excess=24, nr_reclaimed=6013, sc->nr_to_reclaim=1499997, exceeds 5989 pages
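Putting the numbers together: min_excess=24 means only 24 pages were
above memory.min, yet nr_reclaimed=6013, i.e. 5989 pages (~23MB with
4KB pages) were reclaimed from below the protection, matching the
memory.current drop above (409485312 - 385052672 ≈ 23MB).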
On Tue 04-12-18 10:40:29, Xunlei Pang wrote:
> On 2018/12/4 AM 1:22, Michal Hocko wrote:
> > On Mon 03-12-18 23:20:31, Xunlei Pang wrote:
> >> On 2018/12/3 PM 7:56, Michal Hocko wrote:
> >>> On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
> >>>> Cgroup memory can be overcommitted, and this will become
> >>>> even more common in the future.
> >>>>
> >>>> Let's enable kswapd to reclaim low-protected memory under
> >>>> memory pressure, to mitigate the global direct reclaim
> >>>> pressure which can cause jitter in the response time of
> >>>> latency-sensitive groups.
> >>>
> >>> Please be more descriptive about the problem you are trying to handle
> >>> here. I haven't actually read the patch but let me emphasise that the
> >>> low limit protection is an important isolation tool, and allowing
> >>> kswapd to reclaim protected memcgs is going to break the semantics as
> >>> they have been introduced and designed.
> >>
> >> We have two types of memcgs: online groups (important business)
> >> and offline groups (unimportant business). Online groups are
> >> all configured with MAX low protection, while offline groups
> >> are not protected at all (with the default low of 0).
> >>
> >> When offline groups are overcommitted, global memory pressure
> >> rises. This causes memory allocations from online groups to
> >> constantly fall into slow global direct reclaim in order to
> >> reclaim the online groups' page caches, as kswapd is not able
> >> to reclaim low-protected memory. low is not a hard limit; it's
> >> reasonable for it to be reclaimed by kswapd when there's no
> >> other reclaimable memory.
> >
> > I am sorry, I still do not follow. What role do offline cgroups play?
> > Those are certainly not low mem protected because mem_cgroup_css_offline
> > will reset them to 0.
> >
>
> Oh, I meant "offline groups" to be "offline-business groups"; the
> memcgs referred to here are all in the "online" state from the
> kernel's perspective.
What is an offline-business group? Please try to explain the actual
problem in much more detail and do not make us guess.
--
Michal Hocko
SUSE Labs
On 2018/12/4 PM 3:25, Michal Hocko wrote:
> On Tue 04-12-18 10:40:29, Xunlei Pang wrote:
>> On 2018/12/4 AM 1:22, Michal Hocko wrote:
>>> On Mon 03-12-18 23:20:31, Xunlei Pang wrote:
>>>> On 2018/12/3 PM 7:56, Michal Hocko wrote:
>>>>> On Mon 03-12-18 16:01:18, Xunlei Pang wrote:
>>>>>> Cgroup memory can be overcommitted, and this will become
>>>>>> even more common in the future.
>>>>>>
>>>>>> Let's enable kswapd to reclaim low-protected memory under
>>>>>> memory pressure, to mitigate the global direct reclaim
>>>>>> pressure which can cause jitter in the response time of
>>>>>> latency-sensitive groups.
>>>>>
>>>>> Please be more descriptive about the problem you are trying to handle
>>>>> here. I haven't actually read the patch but let me emphasise that the
>>>>> low limit protection is an important isolation tool, and allowing
>>>>> kswapd to reclaim protected memcgs is going to break the semantics as
>>>>> they have been introduced and designed.
>>>>
>>>> We have two types of memcgs: online groups (important business)
>>>> and offline groups (unimportant business). Online groups are
>>>> all configured with MAX low protection, while offline groups
>>>> are not protected at all (with the default low of 0).
>>>>
>>>> When offline groups are overcommitted, global memory pressure
>>>> rises. This causes memory allocations from online groups to
>>>> constantly fall into slow global direct reclaim in order to
>>>> reclaim the online groups' page caches, as kswapd is not able
>>>> to reclaim low-protected memory. low is not a hard limit; it's
>>>> reasonable for it to be reclaimed by kswapd when there's no
>>>> other reclaimable memory.
>>>
>>> I am sorry, I still do not follow. What role do offline cgroups play?
>>> Those are certainly not low mem protected because mem_cgroup_css_offline
>>> will reset them to 0.
>>>
>>
>> Oh, I meant "offline groups" to be "offline-business groups"; the
>> memcgs referred to here are all in the "online" state from the
>> kernel's perspective.
>
> What is an offline-business group? Please try to explain the actual
> problem in much more detail and do not make us guess.
>
Maybe I chose the wrong word; let me rephrase it with an example.
                        root (200GB)
                       /            \
        important (100GB)          unimportant (100GB+DYNAMIC)
        /     |     \               /               \
  docker0  docker1  ...      normal (100GB)    oversold (DYNAMIC)
                              /   |   \          /   |   \
                            j0   j1   ...      w0   w1   ...
"DYNAMIC" is controlled by the cluster job scheduler dynamically,
it periodically samples the available system memory(/proc/meminfo
"MemAvailable"), and use part of that to launch oversold jobs
under some special conditions. When "oversold" is active, the
whole system is put under heavy global memory pressure although
memcgs are not.
IOW "DYNAMIC" is primarily borrowed from "dockers" temporarily,
oversold workers will be killed in a timely fashion if "dockers"
needs their memory back suddenly which is rare.
If kswapd doesn't reclaim the low-protected memory configured
for the "important" dockers, memory allocations from the dockers
will constantly fall into global direct reclaim, which harms
their performance and response time. The inactive caches of the
dockers are allowed to be reclaimed even though they are
low-protected (we used a simple MAX setting): we allow inactive
low-protected memory to be reclaimed immediately and
asynchronously as long as there's no unprotected reclaimable
memory. It's also friendly to disk IO. For a really
latency-sensitive docker, memory.min is supposed to be used to
guarantee its memory QoS.
Thanks
Hi Roman,
On 2018/12/4 AM 2:00, Roman Gushchin wrote:
> On Mon, Dec 03, 2018 at 04:01:17PM +0800, Xunlei Pang wrote:
>> When usage exceeds min, the propagated min usage should be min rather than 0.
>> Apply the same to low.
>>
>> Signed-off-by: Xunlei Pang <[email protected]>
>> ---
>> mm/page_counter.c | 12 ++----------
>> 1 file changed, 2 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/page_counter.c b/mm/page_counter.c
>> index de31470655f6..75d53f15f040 100644
>> --- a/mm/page_counter.c
>> +++ b/mm/page_counter.c
>> @@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
>> return;
>>
>> if (c->min || atomic_long_read(&c->min_usage)) {
>> - if (usage <= c->min)
>> - protected = usage;
>> - else
>> - protected = 0;
>> -
>> + protected = min(usage, c->min);
>
> This change makes sense in combination with patch 3, but not as a
> standalone "fix". It's not a bug; it's a required thing unless you
> start scanning proportionally to the memory.low/min excess.
>
> Please reflect this in the commit message. Or, even better, merge it
> into patch 3.
The more I look at it, the more I think it's a bug, but anyway I'm fine
with merging it into patch 3 :-)
>
> Also, please make sure that the cgroup kselftests pass after your changes.
Sure, will do and send v2. Thanks for your input.
On Wed, Dec 05, 2018 at 04:58:31PM +0800, Xunlei Pang wrote:
> Hi Roman,
>
> On 2018/12/4 AM 2:00, Roman Gushchin wrote:
> > On Mon, Dec 03, 2018 at 04:01:17PM +0800, Xunlei Pang wrote:
> >> When usage exceeds min, the propagated min usage should be min rather than 0.
> >> Apply the same to low.
> >>
> >> Signed-off-by: Xunlei Pang <[email protected]>
> >> ---
> >> mm/page_counter.c | 12 ++----------
> >> 1 file changed, 2 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/mm/page_counter.c b/mm/page_counter.c
> >> index de31470655f6..75d53f15f040 100644
> >> --- a/mm/page_counter.c
> >> +++ b/mm/page_counter.c
> >> @@ -23,11 +23,7 @@ static void propagate_protected_usage(struct page_counter *c,
> >> return;
> >>
> >> if (c->min || atomic_long_read(&c->min_usage)) {
> >> - if (usage <= c->min)
> >> - protected = usage;
> >> - else
> >> - protected = 0;
> >> -
> >> + protected = min(usage, c->min);
> >
> > This change makes sense in combination with patch 3, but not as a
> > standalone "fix". It's not a bug; it's a required thing unless you
> > start scanning proportionally to the memory.low/min excess.
> >
> > Please reflect this in the commit message. Or, even better, merge it
> > into patch 3.
>
> The more I look at it, the more I think it's a bug, but anyway I'm fine
> with merging it into patch 3 :-)
It's not. I explained it back when we were discussing that patch.
TL;DR: because the decision to scan or to skip is binary now, this
trick is necessary to prioritize one cgroup over another. Otherwise
both cgroups can have their usages above their effective memory
protections, and will be scanned at the same pace.
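For instance (made-up numbers): if one sibling is 1MB over its
effective protection and another is 1GB over, both currently get
MEMCG_PROT_NONE and are scanned at the same pace; distinguishing
them requires something like the per-memcg excess capping in patch 3,
which in turn needs min to keep being propagated once usage exceeds
it.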
If you have any doubts, you can try running the memcg kselftests with
and without this change; you'll see the difference.
Thanks!