Using the new config option, users can disable SPLIT_RSS_COUNTING to
get increased accuracy in user-visible mm counters.
Signed-off-by: Daniel Colascione <[email protected]>
---
include/linux/mm.h | 4 ++--
include/linux/mm_types_task.h | 5 ++---
include/linux/sched.h | 2 +-
kernel/fork.c | 2 +-
mm/Kconfig | 11 +++++++++++
mm/memory.c | 6 +++---
6 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc292273e6ba..221395de3cb4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
long val = atomic_long_read(&mm->rss_stat.count[member]);
-#ifdef SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
/*
* counter is updated in asynchronous manner and may go to minus.
* But it's never be expected number for users.
@@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
void sync_mm_rss(struct mm_struct *mm);
#else
static inline void sync_mm_rss(struct mm_struct *mm)
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..d2adc8057e65 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -48,14 +48,13 @@ enum {
NR_MM_COUNTERS
};
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
/* per-thread cached information, */
struct task_rss_stat {
int events; /* for synchronization threshold */
int count[NR_MM_COUNTERS];
};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* CONFIG_SPLIT_RSS_COUNTING */
struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2e56bd8913..22f354774540 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -729,7 +729,7 @@ struct task_struct {
/* Per-thread vma caching: */
struct vmacache vmacache;
-#ifdef SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
diff --git a/kernel/fork.c b/kernel/fork.c
index f9572f416126..fc5e0889922b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
p->vtime.state = VTIME_INACTIVE;
#endif
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a7eb51..372ef9449924 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
config ARCH_HAS_HUGEPD
bool
+config SPLIT_RSS_COUNTING
+ bool "Per-thread mm counter caching"
+ depends on MMU
+ default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
+ help
+ Cache mm counter updates in thread structures and
+ flush them to visible per-process statistics in batches.
+ Say Y here to slightly reduce cache contention in processes
+ with many threads at the expense of decreasing the accuracy
+ of memory statistics in /proc.
+
endmenu
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a079f2..bf557ed5ba23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
core_initcall(init_zero_pfn);
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
void sync_mm_rss(struct mm_struct *mm)
{
@@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
sync_mm_rss(task->mm);
}
-#else /* SPLIT_RSS_COUNTING */
+#else /* CONFIG_SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
{
}
-#endif /* SPLIT_RSS_COUNTING */
+#endif /* CONFIG_SPLIT_RSS_COUNTING */
/*
* Note: this doesn't free the actual pages themselves. That
--
2.23.0.581.g78d2f28ef7-goog
Adding the correct linux-mm address.
On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <[email protected]> wrote:
>
> Using the new config option, users can disable SPLIT_RSS_COUNTING to
> get increased accuracy in user-visible mm counters.
>
> Signed-off-by: Daniel Colascione <[email protected]>
> ---
> include/linux/mm.h | 4 ++--
> include/linux/mm_types_task.h | 5 ++---
> include/linux/sched.h | 2 +-
> kernel/fork.c | 2 +-
> mm/Kconfig | 11 +++++++++++
> mm/memory.c | 6 +++---
> 6 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index cc292273e6ba..221395de3cb4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
> {
> long val = atomic_long_read(&mm->rss_stat.count[member]);
>
> -#ifdef SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> /*
> * counter is updated in asynchronous manner and may go to minus.
> * But it's never be expected number for users.
> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
> *maxrss = hiwater_rss;
> }
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> void sync_mm_rss(struct mm_struct *mm);
> #else
> static inline void sync_mm_rss(struct mm_struct *mm)
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index c1bc6731125c..d2adc8057e65 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -48,14 +48,13 @@ enum {
> NR_MM_COUNTERS
> };
>
> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
> -#define SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> /* per-thread cached information, */
> struct task_rss_stat {
> int events; /* for synchronization threshold */
> int count[NR_MM_COUNTERS];
> };
> -#endif /* USE_SPLIT_PTE_PTLOCKS */
> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>
> struct mm_rss_stat {
> atomic_long_t count[NR_MM_COUNTERS];
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2c2e56bd8913..22f354774540 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -729,7 +729,7 @@ struct task_struct {
> /* Per-thread vma caching: */
> struct vmacache vmacache;
>
> -#ifdef SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> struct task_rss_stat rss_stat;
> #endif
> int exit_state;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f9572f416126..fc5e0889922b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
> p->vtime.state = VTIME_INACTIVE;
> #endif
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> memset(&p->rss_stat, 0, sizeof(p->rss_stat));
> #endif
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5dae9a7eb51..372ef9449924 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
> config ARCH_HAS_HUGEPD
> bool
>
> +config SPLIT_RSS_COUNTING
> + bool "Per-thread mm counter caching"
> + depends on MMU
> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> + help
> + Cache mm counter updates in thread structures and
> + flush them to visible per-process statistics in batches.
> + Say Y here to slightly reduce cache contention in processes
> + with many threads at the expense of decreasing the accuracy
> + of memory statistics in /proc.
> +
> endmenu
> diff --git a/mm/memory.c b/mm/memory.c
> index b1ca51a079f2..bf557ed5ba23 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
> core_initcall(init_zero_pfn);
>
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>
> void sync_mm_rss(struct mm_struct *mm)
> {
> @@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
> if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
> sync_mm_rss(task->mm);
> }
> -#else /* SPLIT_RSS_COUNTING */
> +#else /* CONFIG_SPLIT_RSS_COUNTING */
>
> #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
> #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
> @@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
> {
> }
>
> -#endif /* SPLIT_RSS_COUNTING */
> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>
> /*
> * Note: this doesn't free the actual pages themselves. That
> --
> 2.23.0.581.g78d2f28ef7-goog
>
> On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
>
> Adding the correct linux-mm address.
>
>
>> On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <[email protected]> wrote:
>>
>> Using the new config option, users can disable SPLIT_RSS_COUNTING to
>> get increased accuracy in user-visible mm counters.
>>
>> Signed-off-by: Daniel Colascione <[email protected]>
>> ---
>> include/linux/mm.h | 4 ++--
>> include/linux/mm_types_task.h | 5 ++---
>> include/linux/sched.h | 2 +-
>> kernel/fork.c | 2 +-
>> mm/Kconfig | 11 +++++++++++
>> mm/memory.c | 6 +++---
>> 6 files changed, 20 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index cc292273e6ba..221395de3cb4 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
>> {
>> long val = atomic_long_read(&mm->rss_stat.count[member]);
>>
>> -#ifdef SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> /*
>> * counter is updated in asynchronous manner and may go to minus.
>> * But it's never be expected number for users.
>> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
>> *maxrss = hiwater_rss;
>> }
>>
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> void sync_mm_rss(struct mm_struct *mm);
>> #else
>> static inline void sync_mm_rss(struct mm_struct *mm)
>> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
>> index c1bc6731125c..d2adc8057e65 100644
>> --- a/include/linux/mm_types_task.h
>> +++ b/include/linux/mm_types_task.h
>> @@ -48,14 +48,13 @@ enum {
>> NR_MM_COUNTERS
>> };
>>
>> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
>> -#define SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> /* per-thread cached information, */
>> struct task_rss_stat {
>> int events; /* for synchronization threshold */
>> int count[NR_MM_COUNTERS];
>> };
>> -#endif /* USE_SPLIT_PTE_PTLOCKS */
>> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>>
>> struct mm_rss_stat {
>> atomic_long_t count[NR_MM_COUNTERS];
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 2c2e56bd8913..22f354774540 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -729,7 +729,7 @@ struct task_struct {
>> /* Per-thread vma caching: */
>> struct vmacache vmacache;
>>
>> -#ifdef SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> struct task_rss_stat rss_stat;
>> #endif
>> int exit_state;
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index f9572f416126..fc5e0889922b 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
>> p->vtime.state = VTIME_INACTIVE;
>> #endif
>>
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> memset(&p->rss_stat, 0, sizeof(p->rss_stat));
>> #endif
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index a5dae9a7eb51..372ef9449924 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
>> config ARCH_HAS_HUGEPD
>> bool
>>
>> +config SPLIT_RSS_COUNTING
>> + bool "Per-thread mm counter caching"
>> + depends on MMU
>> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
>> + help
>> + Cache mm counter updates in thread structures and
>> + flush them to visible per-process statistics in batches.
>> + Say Y here to slightly reduce cache contention in processes
>> + with many threads at the expense of decreasing the accuracy
>> + of memory statistics in /proc.
>> +
>> endmenu
All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
>> diff --git a/mm/memory.c b/mm/memory.c
>> index b1ca51a079f2..bf557ed5ba23 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
>> core_initcall(init_zero_pfn);
>>
>>
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>>
>> void sync_mm_rss(struct mm_struct *mm)
>> {
>> @@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>> if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
>> sync_mm_rss(task->mm);
>> }
>> -#else /* SPLIT_RSS_COUNTING */
>> +#else /* CONFIG_SPLIT_RSS_COUNTING */
>>
>> #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
>> #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
>> @@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>> {
>> }
>>
>> -#endif /* SPLIT_RSS_COUNTING */
>> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>>
>> /*
>> * Note: this doesn't free the actual pages themselves. That
>> --
>> 2.23.0.581.g78d2f28ef7-goog
>>
On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <[email protected]> wrote:
> > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
> >
> > Adding the correct linux-mm address.
> >
> >
> >> On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <[email protected]> wrote:
> >>
> >> Using the new config option, users can disable SPLIT_RSS_COUNTING to
> >> get increased accuracy in user-visible mm counters.
> >>
> >> Signed-off-by: Daniel Colascione <[email protected]>
> >> ---
> >> include/linux/mm.h | 4 ++--
> >> include/linux/mm_types_task.h | 5 ++---
> >> include/linux/sched.h | 2 +-
> >> kernel/fork.c | 2 +-
> >> mm/Kconfig | 11 +++++++++++
> >> mm/memory.c | 6 +++---
> >> 6 files changed, 20 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index cc292273e6ba..221395de3cb4 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
> >> {
> >> long val = atomic_long_read(&mm->rss_stat.count[member]);
> >>
> >> -#ifdef SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> /*
> >> * counter is updated in asynchronous manner and may go to minus.
> >> * But it's never be expected number for users.
> >> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
> >> *maxrss = hiwater_rss;
> >> }
> >>
> >> -#if defined(SPLIT_RSS_COUNTING)
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> void sync_mm_rss(struct mm_struct *mm);
> >> #else
> >> static inline void sync_mm_rss(struct mm_struct *mm)
> >> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> >> index c1bc6731125c..d2adc8057e65 100644
> >> --- a/include/linux/mm_types_task.h
> >> +++ b/include/linux/mm_types_task.h
> >> @@ -48,14 +48,13 @@ enum {
> >> NR_MM_COUNTERS
> >> };
> >>
> >> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
> >> -#define SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> /* per-thread cached information, */
> >> struct task_rss_stat {
> >> int events; /* for synchronization threshold */
> >> int count[NR_MM_COUNTERS];
> >> };
> >> -#endif /* USE_SPLIT_PTE_PTLOCKS */
> >> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
> >>
> >> struct mm_rss_stat {
> >> atomic_long_t count[NR_MM_COUNTERS];
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 2c2e56bd8913..22f354774540 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -729,7 +729,7 @@ struct task_struct {
> >> /* Per-thread vma caching: */
> >> struct vmacache vmacache;
> >>
> >> -#ifdef SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> struct task_rss_stat rss_stat;
> >> #endif
> >> int exit_state;
> >> diff --git a/kernel/fork.c b/kernel/fork.c
> >> index f9572f416126..fc5e0889922b 100644
> >> --- a/kernel/fork.c
> >> +++ b/kernel/fork.c
> >> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
> >> p->vtime.state = VTIME_INACTIVE;
> >> #endif
> >>
> >> -#if defined(SPLIT_RSS_COUNTING)
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> memset(&p->rss_stat, 0, sizeof(p->rss_stat));
> >> #endif
> >>
> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> index a5dae9a7eb51..372ef9449924 100644
> >> --- a/mm/Kconfig
> >> +++ b/mm/Kconfig
> >> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
> >> config ARCH_HAS_HUGEPD
> >> bool
> >>
> >> +config SPLIT_RSS_COUNTING
> >> + bool "Per-thread mm counter caching"
> >> + depends on MMU
> >> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> >> + help
> >> + Cache mm counter updates in thread structures and
> >> + flush them to visible per-process statistics in batches.
> >> + Say Y here to slightly reduce cache contention in processes
> >> + with many threads at the expense of decreasing the accuracy
> >> + of memory statistics in /proc.
> >> +
> >> endmenu
>
> All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
and a basic desktop, it doesn't make a difference. I figured making it
a knob would help allay concerns about the performance impact in more
extreme configurations.
On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <[email protected]> wrote:
> > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
> > >
> > > Adding the correct linux-mm address.
> > >
> > >
> > >> +config SPLIT_RSS_COUNTING
> > >> + bool "Per-thread mm counter caching"
> > >> + depends on MMU
> > >> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > >> + help
> > >> + Cache mm counter updates in thread structures and
> > >> + flush them to visible per-process statistics in batches.
> > >> + Say Y here to slightly reduce cache contention in processes
> > >> + with many threads at the expense of decreasing the accuracy
> > >> + of memory statistics in /proc.
> > >> +
> > >> endmenu
> >
> > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
>
> Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> and a basic desktop, it doesn't make a difference. I figured making it
> a knob would help allay concerns about the performance impact in more
> extreme configurations.
I do agree with Qian. Either it is really helpful (is it? probably on
the number of cpus) and it should be auto-enabled or it should be
dropped altogether. You cannot really expect people know how to enable
this without a deep understanding of the MM internals. Not to mention
all those users using distro kernels/configs.
A config option sounds like a bad way forward.
--
Michal Hocko
SUSE Labs
On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <[email protected]> wrote:
> > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
> > > >
> > > > Adding the correct linux-mm address.
> > > >
> > > >
> > > >> +config SPLIT_RSS_COUNTING
> > > >> + bool "Per-thread mm counter caching"
> > > >> + depends on MMU
> > > >> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > >> + help
> > > >> + Cache mm counter updates in thread structures and
> > > >> + flush them to visible per-process statistics in batches.
> > > >> + Say Y here to slightly reduce cache contention in processes
> > > >> + with many threads at the expense of decreasing the accuracy
> > > >> + of memory statistics in /proc.
> > > >> +
> > > >> endmenu
> > >
> > > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
> >
> > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > and a basic desktop, it doesn't make a difference. I figured making it
> > a knob would help allay concerns about the performance impact in more
> > extreme configurations.
>
> I do agree with Qian. Either it is really helpful (is it? probably on
> the number of cpus) and it should be auto-enabled or it should be
> dropped altogether. You cannot really expect people know how to enable
> this without a deep understanding of the MM internals. Not to mention
> all those users using distro kernels/configs.
>
> A config option sounds like a bad way forward.
And I don't see much point anyway. Reading RSS counters from proc is
inherently racy. It can just either way after the read due to process
behaviour.
--
Kirill A. Shutemov
On Fri, Oct 4, 2019 at 6:26 AM Kirill A. Shutemov <[email protected]> wrote:
> On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> > On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <[email protected]> wrote:
> > > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
> > > > >
> > > > > Adding the correct linux-mm address.
> > > > >
> > > > >
> > > > >> +config SPLIT_RSS_COUNTING
> > > > >> + bool "Per-thread mm counter caching"
> > > > >> + depends on MMU
> > > > >> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > > >> + help
> > > > >> + Cache mm counter updates in thread structures and
> > > > >> + flush them to visible per-process statistics in batches.
> > > > >> + Say Y here to slightly reduce cache contention in processes
> > > > >> + with many threads at the expense of decreasing the accuracy
> > > > >> + of memory statistics in /proc.
> > > > >> +
> > > > >> endmenu
> > > >
> > > > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
> > >
> > > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > > and a basic desktop, it doesn't make a difference. I figured making it
> > > a knob would help allay concerns about the performance impact in more
> > > extreme configurations.
> >
> > I do agree with Qian. Either it is really helpful (is it? probably on
> > the number of cpus) and it should be auto-enabled or it should be
> > dropped altogether. You cannot really expect people know how to enable
> > this without a deep understanding of the MM internals. Not to mention
> > all those users using distro kernels/configs.
> >
> > A config option sounds like a bad way forward.
>
> And I don't see much point anyway. Reading RSS counters from proc is
> inherently racy. It can just either way after the read due to process
> behaviour.
Split RSS accounting doesn't make reading from mm counters racy. It
makes these counters *wrong*. We flush task mm counters to the
mm_struct once every 64 page faults that a task incurs or when that
task exits. That means that if a thread takes 63 page faults and then
sleeps for a week, that thread's process's mm counters are wrong by 63
pages *for a week*. And some processes have a lot of threads,
compounding the error. Split RSS accounting means that memory usage
numbers don't add up. I don't think it's unreasonable to want a mode
where memory counters to agree with other indicators of system
activity.
Nobody has demonstrated that split RSS accounting actually helps in
the real world. But I've described above, concretely, how split RSS
accounting hurts. I've been trying for over a year to either disable
split RSS accounting or to let people opt out of it. If you won't
remove split RSS accounting and you won't let me add a configuration
knob that lets people opt out of it, what will you accept?
On Fri 04-10-19 06:45:21, Daniel Colascione wrote:
[...]
> Nobody has demonstrated that split RSS accounting actually helps in
> the real world. But I've described above, concretely, how split RSS
> accounting hurts. I've been trying for over a year to either disable
> split RSS accounting or to let people opt out of it. If you won't
> remove split RSS accounting and you won't let me add a configuration
> knob that lets people opt out of it, what will you accept?
What is an argument to keep it around?
--
Michal Hocko
SUSE Labs
On Fri, Oct 04, 2019 at 06:45:21AM -0700, Daniel Colascione wrote:
> On Fri, Oct 4, 2019 at 6:26 AM Kirill A. Shutemov <[email protected]> wrote:
> > On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> > > On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > > > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <[email protected]> wrote:
> > > > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <[email protected]> wrote:
> > > > > >
> > > > > > Adding the correct linux-mm address.
> > > > > >
> > > > > >
> > > > > >> +config SPLIT_RSS_COUNTING
> > > > > >> + bool "Per-thread mm counter caching"
> > > > > >> + depends on MMU
> > > > > >> + default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > > > >> + help
> > > > > >> + Cache mm counter updates in thread structures and
> > > > > >> + flush them to visible per-process statistics in batches.
> > > > > >> + Say Y here to slightly reduce cache contention in processes
> > > > > >> + with many threads at the expense of decreasing the accuracy
> > > > > >> + of memory statistics in /proc.
> > > > > >> +
> > > > > >> endmenu
> > > > >
> > > > > All those vague words are going to make developers almost
> > > > > impossible to decide the right selection here. It sounds like we
> > > > > should kill SPLIT_RSS_COUNTING at all to simplify the code as
> > > > > the benefit is so small vs the side-effect?
> > > >
> > > > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > > > and a basic desktop, it doesn't make a difference. I figured making it
> > > > a knob would help allay concerns about the performance impact in more
> > > > extreme configurations.
> > >
> > > I do agree with Qian. Either it is really helpful (is it? probably on
> > > the number of cpus) and it should be auto-enabled or it should be
> > > dropped altogether. You cannot really expect people know how to enable
> > > this without a deep understanding of the MM internals. Not to mention
> > > all those users using distro kernels/configs.
> > >
> > > A config option sounds like a bad way forward.
> >
> > And I don't see much point anyway. Reading RSS counters from proc is
> > inherently racy. It can just either way after the read due to process
> > behaviour.
>
> Split RSS accounting doesn't make reading from mm counters racy. It
> makes these counters *wrong*. We flush task mm counters to the
> mm_struct once every 64 page faults that a task incurs or when that
> task exits. That means that if a thread takes 63 page faults and then
> sleeps for a week, that thread's process's mm counters are wrong by 63
> pages *for a week*. And some processes have a lot of threads,
> compounding the error. Split RSS accounting means that memory usage
> numbers don't add up. I don't think it's unreasonable to want a mode
> where memory counters to agree with other indicators of system
> activity.
It's documented behaviour that is upstream for 9 years. Why is your workload
special?
The documentation suggests to use smaps if you want to have precise data.
Why would it not fly for you?
> Nobody has demonstrated that split RSS accounting actually helps in
> the real world.
The original commit 34e55232e59f ("mm: avoid false sharing of mm_counter")
shows numbers on cache misses. It's not a real world workload, but you
don't have any numbers at all to back your claim.
> But I've described above, concretely, how split RSS
> accounting hurts. I've been trying for over a year to either disable
> split RSS accounting or to let people opt out of it. If you won't
> remove split RSS accounting and you won't let me add a configuration
> knob that lets people opt out of it, what will you accept?
Keeping stats precise is welcome, but often expensive. It might be
negligible for small machine, but becomes a problem on multisocket
machine with dozens or hundreds of cores. We need to keep kernel scalable.
We have other stats that update asynchronously (i.e. /proc/vmstat). Would
you like to convert them too?
--
Kirill A. Shutemov