From: Zhaoyang Huang <[email protected]>
Uninterruptible sleep is not yet monitored as an important system status.
Imagine that a set of PSI triggers is created to monitor a specific group, yet
none of the pressures reads high, even though the processes within the group
are stuck on some resource and have fallen into uninterruptible (UN) state.
Introduce PSI_UN as a new pressure type in the PSI subsystem to cover this.
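
For illustration, here is a minimal userspace sketch of the kind of trigger
setup meant above, following the existing PSI trigger ABI (write a
"some <stall us> <window us>" string to the pressure file, then poll for
POLLPRI). The "/proc/pressure/un" path is only an assumption, since this
patch does not add the user-visible file:

        #include <fcntl.h>
        #include <poll.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                /* Hypothetical file for the new PSI_UN resource. */
                struct pollfd fds;
                /* Wake up on 150ms of stall within any 1s window. */
                const char trig[] = "some 150000 1000000";

                fds.fd = open("/proc/pressure/un", O_RDWR | O_NONBLOCK);
                if (fds.fd < 0)
                        return 1;
                if (write(fds.fd, trig, strlen(trig) + 1) < 0)
                        return 1;
                fds.events = POLLPRI;

                while (poll(&fds, 1, -1) > 0) {
                        if (fds.revents & POLLPRI)
                                printf("uninterruptible-sleep pressure event\n");
                }
                close(fds.fd);
                return 0;
        }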
Signed-off-by: Zhaoyang Huang <[email protected]>
---
include/linux/psi_types.h | 11 ++++++++---
kernel/sched/psi.c | 10 ++++++++++
kernel/sched/stats.h | 6 +++++-
3 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index c7fe7c0..8cc1979 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -15,6 +15,7 @@ enum psi_task_count {
NR_IOWAIT,
NR_MEMSTALL,
NR_RUNNING,
+ NR_UNINTERRUPTIBLE,
/*
* This can't have values other than 0 or 1 and could be
* implemented as a bit flag. But for now we still have room
@@ -32,7 +33,7 @@ enum psi_task_count {
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
- NR_PSI_TASK_COUNTS = 5,
+ NR_PSI_TASK_COUNTS = 6,
};
/* Task state bitmasks */
@@ -41,13 +42,15 @@ enum psi_task_count {
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
+#define TSK_UNINTERRUPTIBLE (1 << NR_UNINTERRUPTIBLE)
/* Resources that workloads could be stalled on */
enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES = 3,
+ PSI_UN,
+ NR_PSI_RESOURCES = 4,
};
/*
@@ -63,9 +66,11 @@ enum psi_states {
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
+ PSI_UN_SOME,
+ PSI_UN_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES = 7,
+ NR_PSI_STATES = 9,
};
enum psi_aggregators {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a337f3e..a37b4a4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -231,6 +231,10 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+ case PSI_UN_SOME:
+ return unlikely(tasks[NR_UNINTERRUPTIBLE]);
+ case PSI_UN_FULL:
+ return unlikely(tasks[NR_UNINTERRUPTIBLE] && !tasks[NR_RUNNING]);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -683,6 +687,12 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
groupc->times[PSI_CPU_FULL] += delta;
}
+ if (groupc->state_mask & (1 << PSI_UN_SOME)) {
+ groupc->times[PSI_UN_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_UN_FULL))
+ groupc->times[PSI_UN_FULL] += delta;
+ }
+
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baa839c..bf98829 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -132,6 +132,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (p->in_iowait)
clear |= TSK_IOWAIT;
}
+ clear |= TSK_UNINTERRUPTIBLE;
psi_task_change(p, clear, set);
}
@@ -139,6 +140,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
int clear = TSK_RUNNING;
+ int set = 0;
if (static_branch_likely(&psi_disabled))
return;
@@ -154,8 +156,10 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
if (p->in_memstall)
clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
+ if (READ_ONCE(p->__state) & TASK_UNINTERRUPTIBLE)
+ set = TSK_UNINTERRUPTIBLE;
- psi_task_change(p, clear, 0);
+ psi_task_change(p, clear, set);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
--
1.9.1
On 2022/8/8 14:13, zhaoyang.huang wrote:
> From: Zhaoyang Huang <[email protected]>
>
> Uninterruptible sleep is not yet monitored as an important system status.
> Imagine that a set of PSI triggers is created to monitor a specific group, yet
> none of the pressures reads high, even though the processes within the group
> are stuck on some resource and have fallen into uninterruptible (UN) state.
> Introduce PSI_UN as a new pressure type in the PSI subsystem to cover this.
Hello,
The problem is that not every TASK_UNINTERRUPTIBLE task is stalled on some
shared resource; think of the many schedule_timeout() paths, for example.
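
A minimal kernel-side sketch of what I mean (example_wait() is a made-up
helper): a plain timed delay puts the task into TASK_UNINTERRUPTIBLE without
it waiting on any shared resource, so the proposed PSI_UN would count it as
pressure anyway:

        #include <linux/delay.h>

        /* Made-up helper, only to illustrate the point above. */
        static void example_wait(void)
        {
                /*
                 * msleep() sleeps via schedule_timeout_uninterruptible(),
                 * so the task is in TASK_UNINTERRUPTIBLE for the whole
                 * delay even though it is not blocked on a lock, on I/O
                 * or on memory.
                 */
                msleep(20);
        }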
Thanks.
> [...]
On Mon, Aug 8, 2022 at 5:12 PM Chengming Zhou
<[email protected]> wrote:
>
> On 2022/8/8 14:13, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > Uninterruptible sleep is not yet monitored as an important system status.
> > Imagine that a set of PSI triggers is created to monitor a specific group, yet
> > none of the pressures reads high, even though the processes within the group
> > are stuck on some resource and have fallen into uninterruptible (UN) state.
> > Introduce PSI_UN as a new pressure type in the PSI subsystem to cover this.
>
> Hello,
>
> The problem is that not every TASK_UNINTERRUPTIBLE task is stalled on some
> shared resource; think of the many schedule_timeout() paths, for example.
Thanks for the heads-up. The aim is to tell where the processes go
(on or off CPU? waiting on something?). It could be viewed as a
PSI_MEM-like property, which is likewise not a precise account of time
spent on memory. Furthermore, we do have ways to make it precise.
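
Just as one possible illustration (not necessarily the method meant here,
and not part of this patch): the existing TASK_NOLOAD annotation, used by
TASK_IDLE sleepers such as schedule_timeout_idle(), could be consulted in
psi_dequeue() so that opted-out timed sleeps are not counted:

        /* Sketch only: skip D-state sleeps annotated with TASK_NOLOAD. */
        unsigned int state = READ_ONCE(p->__state);

        if ((state & TASK_UNINTERRUPTIBLE) && !(state & TASK_NOLOAD))
                set = TSK_UNINTERRUPTIBLE;

That only covers callers that already use TASK_IDLE, so it narrows the
over-counting rather than fully solving it.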
>
> Thanks.
>
> > [...]