Provide a kernel stack usage histogram to aid in optimizing kernel stack
sizes and minimizing memory waste in large-scale environments. The
histogram divides stack usage into power-of-two buckets and reports the
results in /proc/vmstat. This information is especially valuable in
environments with millions of machines, where even small optimizations
can have a significant impact.
The histogram data is presented in /proc/vmstat with entries like
"kstack_1k", "kstack_2k", and so on, indicating the number of threads
that exited with stack usage falling within each respective bucket.
Example outputs:
Intel:
$ grep kstack /proc/vmstat
kstack_1k 3
kstack_2k 188
kstack_4k 11391
kstack_8k 243
kstack_16k 0
ARM with 64K page_size:
$ grep kstack /proc/vmstat
kstack_1k 1
kstack_2k 340
kstack_4k 25212
kstack_8k 1659
kstack_16k 0
kstack_32k 0
kstack_64k 0
Signed-off-by: Pasha Tatashin <[email protected]>
---
Changelog:
v3:
- Changed from page counts to power-of-two buckets; this is helpful for
  builds with large base pages (e.g. arm64 with 64K pages) to evaluate
  kernel stack internal fragmentation.
include/linux/sched/task_stack.h | 49 ++++++++++++++++++++++++++++++--
include/linux/vm_event_item.h | 42 +++++++++++++++++++++++++++
include/linux/vmstat.h | 16 -----------
mm/vmstat.c | 24 ++++++++++++++++
4 files changed, 113 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd72b978e1f..65e8c9fb7f9b 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
extern void thread_stack_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#include <linux/vm_event_item.h>
+
+/* Count the maximum stack usage reached, in power-of-two byte buckets */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+ if (used_stack <= 1024)
+ this_cpu_inc(vm_event_states.event[KSTACK_1K]);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ this_cpu_inc(vm_event_states.event[KSTACK_2K]);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ this_cpu_inc(vm_event_states.event[KSTACK_4K]);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ this_cpu_inc(vm_event_states.event[KSTACK_8K]);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ this_cpu_inc(vm_event_states.event[KSTACK_16K]);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ this_cpu_inc(vm_event_states.event[KSTACK_32K]);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ this_cpu_inc(vm_event_states.event[KSTACK_64K]);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ this_cpu_inc(vm_event_states.event[KSTACK_REST]);
+#endif
+}
+#else /* !CONFIG_VM_EVENT_COUNTERS */
+static inline void kstack_histogram(unsigned long used_stack) {}
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
static inline unsigned long stack_not_used(struct task_struct *p)
{
unsigned long *n = end_of_stack(p);
+ unsigned long unused_stack;
do { /* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
@@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
} while (!*n);
# ifdef CONFIG_STACK_GROWSUP
- return (unsigned long)end_of_stack(p) - (unsigned long)n;
+ unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
# else
- return (unsigned long)n - (unsigned long)end_of_stack(p);
+ unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
# endif
+ kstack_histogram(THREAD_SIZE - unused_stack);
+
+ return unused_stack;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..73fa5fbf33a3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -154,9 +154,51 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
VMA_LOCK_RETRY,
VMA_LOCK_MISS,
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ KSTACK_1K,
+#if THREAD_SIZE > 1024
+ KSTACK_2K,
+#endif
+#if THREAD_SIZE > 2048
+ KSTACK_4K,
+#endif
+#if THREAD_SIZE > 4096
+ KSTACK_8K,
+#endif
+#if THREAD_SIZE > 8192
+ KSTACK_16K,
+#endif
+#if THREAD_SIZE > 16384
+ KSTACK_32K,
+#endif
+#if THREAD_SIZE > 32768
+ KSTACK_64K,
+#endif
+#if THREAD_SIZE > 65536
+ KSTACK_REST,
+#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
NR_VM_EVENT_ITEMS
};
+#ifdef CONFIG_VM_EVENT_COUNTERS
+/*
+ * Light weight per cpu counter implementation.
+ *
+ * Counters should only be incremented and no critical kernel component
+ * should rely on the counter values.
+ *
+ * Counters are handled completely inline. On many platforms the code
+ * generated will simply be the increment of a global address.
+ */
+
+struct vm_event_state {
+ unsigned long event[NR_VM_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
+#endif
+
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 343906a98d6e..18d4a97d3afd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -41,22 +41,6 @@ enum writeback_stat_item {
};
#ifdef CONFIG_VM_EVENT_COUNTERS
-/*
- * Light weight per cpu counter implementation.
- *
- * Counters should only be incremented and no critical kernel component
- * should rely on the counter values.
- *
- * Counters are handled completely inline. On many platforms the code
- * generated will simply be the increment of a global address.
- */
-
-struct vm_event_state {
- unsigned long event[NR_VM_EVENT_ITEMS];
-};
-
-DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
-
/*
* vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
* local_irq_disable overhead.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..21932bd6a449 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1413,6 +1413,30 @@ const char * const vmstat_text[] = {
"vma_lock_retry",
"vma_lock_miss",
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ "kstack_1k",
+#if THREAD_SIZE > 1024
+ "kstack_2k",
+#endif
+#if THREAD_SIZE > 2048
+ "kstack_4k",
+#endif
+#if THREAD_SIZE > 4096
+ "kstack_8k",
+#endif
+#if THREAD_SIZE > 8192
+ "kstack_16k",
+#endif
+#if THREAD_SIZE > 16384
+ "kstack_32k",
+#endif
+#if THREAD_SIZE > 32768
+ "kstack_64k",
+#endif
+#if THREAD_SIZE > 65536
+ "kstack_rest",
+#endif
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.45.1.288.g0e0cd299f1-goog
Hi Pasha,
On Thu, May 30, 2024 at 05:02:59PM GMT, Pasha Tatashin wrote:
> Provide a kernel stack usage histogram to aid in optimizing kernel stack
> sizes and minimizing memory waste in large-scale environments. The
> histogram divides stack usage into power-of-two buckets and reports the
> results in /proc/vmstat. This information is especially valuable in
> environments with millions of machines, where even small optimizations
> can have a significant impact.
>
> The histogram data is presented in /proc/vmstat with entries like
> "kstack_1k", "kstack_2k", and so on, indicating the number of threads
> that exited with stack usage falling within each respective bucket.
>
> Example outputs:
> Intel:
> $ grep kstack /proc/vmstat
> kstack_1k 3
> kstack_2k 188
> kstack_4k 11391
> kstack_8k 243
> kstack_16k 0
>
> ARM with 64K page_size:
> $ grep kstack /proc/vmstat
> kstack_1k 1
> kstack_2k 340
> kstack_4k 25212
> kstack_8k 1659
> kstack_16k 0
> kstack_32k 0
> kstack_64k 0
>
> Signed-off-by: Pasha Tatashin <[email protected]>
Couple of questions:
1. In future with your on-demand kstack allocation feature, will these
metrics still be useful? (I think so but I want to know your take)
2. With on-demand kstack allocation, the stack_not_used() needs to be
changed to not cause the allocation, right?
3. Does the histogram get updated on exit only? What about long-running
kernel threads that will never exit?
thanks,
Shakeel
Hi Shakeel,
> Couple of questions:
>
> 1. In future with your on-demand kstack allocation feature, will these
> metrics still be useful? (I think so but I want to know your take)
It depends on how on-demand allocation is implemented. On hardware
that supports faults on kernel stacks, we will have other metrics that
show the total number of pages allocated for stacks. On hardware where
faults are not supported, we will most likely have some optimization
where only some threads are extended, and for those, these metrics
will still be very useful.
> 2. With on-demand kstack allocation, the stack_not_used() needs to be
> changed to not cause the allocation, right?
This is correct. In my WIP dynamic kernel tasks RFCv2 patch series, I
have an optimized version of stack_not_used() that uses the number of
allocated pages in the partially filled vmap to determine the last
stack address.
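
Very roughly, the idea looks like the sketch below. This is a hand-wavy
illustration only, not the actual RFC code: stack_nr_populated_pages() is
a placeholder name for however the series ends up tracking the populated
portion of the stack vmap, and only the grows-down case is shown.

/*
 * Sketch: when only the top N pages of a grows-down VMAP stack are
 * populated, start the canary scan at the lowest populated address
 * instead of end_of_stack(), so unpopulated pages are never touched
 * (and therefore never faulted in or allocated).
 */
static inline unsigned long stack_not_used_dynamic(struct task_struct *p)
{
	unsigned long low = (unsigned long)end_of_stack(p);
	unsigned long top = low + THREAD_SIZE;
	/* Placeholder: bytes of the stack vmap that are actually populated */
	unsigned long populated = stack_nr_populated_pages(p) * PAGE_SIZE;
	unsigned long start = top - populated;
	unsigned long *n;

	/* Fully populated: fall back to skipping just the canary word */
	if (start <= low)
		start = low + sizeof(unsigned long);

	/* Skip over the zeroed, never-written part of the populated area */
	n = (unsigned long *)start;
	while (!*n)
		n++;

	return (unsigned long)n - low;
}

The real version also has to handle CONFIG_STACK_GROWSUP, but the point is
that the scan never has to look below the populated boundary.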
> 3. Does the histogram get updated on exit only? What about long-running
> kernel threads that will never exit?
Yes, for performance reasons, the histogram is updated only on exit.
It would be too expensive to calculate for all running tasks. However,
it could be extended to be queried on demand via a debugfs interface
for all running threads. On machines where jobs come and go over time,
this histogram will show the actual stack usage distribution.
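
If we ever need that, a debugfs dump could look roughly like the sketch
below. This is illustration only, not part of this patch: the file name
"kstack_live" and the bucket math are made up for the example, it assumes
CONFIG_DEBUG_STACK_USAGE, and note that stack_not_used() as modified by
this patch also bumps the exit-time counters, so a real version would
factor out the raw canary scan.

#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/seq_file.h>

/* One bucket per power of two, from 1K up to THREAD_SIZE */
#define KSTACK_LIVE_BUCKETS	(const_ilog2(THREAD_SIZE) - 9)

static int kstack_live_show(struct seq_file *m, void *v)
{
	unsigned long bucket[KSTACK_LIVE_BUCKETS] = { 0 };
	struct task_struct *g, *t;
	int i;

	rcu_read_lock();
	for_each_process_thread(g, t) {
		/* Racy for running tasks, but fine for a debug interface */
		unsigned long used = THREAD_SIZE - stack_not_used(t);

		/* used <= THREAD_SIZE, so the index stays within the array */
		i = used <= 1024 ? 0 : ilog2(used - 1) - 9;
		bucket[i]++;
	}
	rcu_read_unlock();

	for (i = 0; i < KSTACK_LIVE_BUCKETS; i++)
		seq_printf(m, "kstack_%luk %lu\n", 1UL << i, bucket[i]);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(kstack_live);

static int __init kstack_live_init(void)
{
	debugfs_create_file("kstack_live", 0400, NULL, NULL,
			    &kstack_live_fops);
	return 0;
}
late_initcall(kstack_live_init);

Reading the file would then give the same kind of output as /proc/vmstat,
but recomputed over the threads that are alive at that moment.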
Thank you,
Pasha
> thanks,
> Shakeel
On Thu, May 30, 2024 at 08:14:17PM GMT, Pasha Tatashin wrote:
> Hi Shakeel,
>
> > Couple of questions:
> >
> > 1. In future with your on-demand kstack allocation feature, will these
> > metrics still be useful? (I think so but I want to know your take)
>
> It depends on how on-demand allocation is implemented. On hardware
> that supports faults on kernel stacks,
Which hardware supports faults on kernel stacks and which do not?
> we will have other metrics that
> show the total number of pages allocated for stacks.
Don't we already have a metric for that i.e. KernelStack in meminfo
which is in kB unit?
One more question: Is there any concern in making
CONFIG_DEBUG_STACK_USAGE not a debug feature, i.e. enabling it in default
kernels instead of just debug kernels?
thanks,
Shakeel
On Thu, 30 May 2024 20:14:17 -0400 Pasha Tatashin <[email protected]> wrote:
> > Couple of questions:
> >
> > 1. In future with your on-demand kstack allocation feature, will these
> > metrics still be useful? (I think so but I want to know your take)
I do think the changelog for this patch should reference the dynamic
stack feature. It strengthens the justification for adding this patch.
> It depends on how on-demand allocation is implemented. On hardware
> that supports faults on kernel stacks, we will have other metrics that
> show the total number of pages allocated for stacks. On hardware where
> faults are not supported, we will most likely have some optimization
> where only some threads are extended, and for those, these metrics
> will still be very useful.
Also useful changelog info for this patch.
Pasha, could you please prepare some additional text which I can paste
in? Thanks.
On Fri, May 31, 2024 at 8:38 PM Andrew Morton <[email protected]> wrote:
>
> On Thu, 30 May 2024 20:14:17 -0400 Pasha Tatashin <[email protected]> wrote:
>
> > > Couple of questions:
> > >
> > > 1. In future with your on-demand kstack allocation feature, will these
> > > metrics still be useful? (I think so but I want to know your take)
>
> I do think the changelog for this patch should reference the dynamic
> stack feature. It strengthens the justification for adding this patch.
I will add a reference.
>
> > It depends on how on-demand allocation is implemented. On hardware
> > that supports faults on kernel stacks, we will have other metrics that
> > show the total number of pages allocated for stacks. On hardware where
> > faults are not supported, we will most likely have some optimization
> > where only some threads are extended, and for those, these metrics
> > will still be very useful.
>
> Also useful changelog info for this patch.
>
> Pasha, could you please prepare some additional text which I can paste
> in? Thanks.
Yes, this makes sense, I will send a new version with the updated
commit log soon.
Thank you,
Pasha
Hi Pasha, I think you might have missed the questions I had below. Your
response would really be appreciated.
On Fri, May 31, 2024 at 03:42:34PM GMT, Shakeel Butt wrote:
> On Thu, May 30, 2024 at 08:14:17PM GMT, Pasha Tatashin wrote:
> > Hi Shakeel,
> >
> > > Couple of questions:
> > >
> > > 1. In future with your on-demand kstack allocation feature, will these
> > > metrics still be useful? (I think so but I want to know your take)
> >
> > It depends on how on-demand allocation is implemented. On hardware
> > that supports faults on kernel stacks,
>
> Which hardware supports faults on kernel stacks and which do not?
>
> > we will have other metrics that
> > show the total number of pages allocated for stacks.
>
> Don't we already have a metric for that i.e. KernelStack in meminfo
> which is in kB unit?
>
> One more question: Is there any concern in making
> CONFIG_DEBUG_STACK_USAGE not a debug feature, i.e. enabling it in default
> kernels instead of just debug kernels?
>
> thanks,
> Shakeel
>