2020-02-22 14:47:46

by Yafang Shao

[permalink] [raw]
Subject: [PATCH] psi: move PF_MEMSTALL into psi specific psi_flags

task->flags is a 32-bit field in which 31 bits have already been
consumed, so it is hard to introduce any new per-process flag.
As there is already a psi-specific field, psi_flags, we'd better move the
psi-specific per-process flag PF_MEMSTALL into it.

Signed-off-by: Yafang Shao <[email protected]>
---
include/linux/psi_types.h | 12 +++++++++++-
include/linux/sched.h | 7 +++++--
kernel/sched/psi.c | 15 ++++++++-------
kernel/sched/stats.h | 10 +++++-----
4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b82241..411dbbf57d51 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -17,11 +17,21 @@ enum psi_task_count {
NR_PSI_TASK_COUNTS = 3,
};

-/* Task state bitmasks */
+/*
+ * Task state bitmasks:
+ * These flags are stored in the lower PSI_TSK_STATE_BITS bits of
+ * task->psi_flags; the higher bits hold per-process flags, which
+ * persist across sleeps.
+ */
+#define PSI_TSK_STATE_BITS 16
+#define PSI_TSK_STATE_MASK ((1 << PSI_TSK_STATE_BITS) - 1)
#define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)

+/* Stalled due to lack of memory; this is a per-process flag. */
+#define PSI_PF_MEMSTALL (1 << PSI_TSK_STATE_BITS)
+
/* Resources that workloads could be stalled on */
enum psi_res {
PSI_IO,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f314790cb527..2d4c04d35d9b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1025,7 +1025,11 @@ struct task_struct {

struct task_io_accounting ioac;
#ifdef CONFIG_PSI
- /* Pressure stall state */
+ /*
+ * Pressure stall state:
+ * Bits 0 ~ PSI_TSK_STATE_BITS-1: PSI task states
+ * Bits PSI_TSK_STATE_BITS ~ 31: Per process flags
+ */
unsigned int psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
@@ -1490,7 +1494,6 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
-#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 028520702717..34363fc77ecc 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -759,7 +759,8 @@ void psi_task_change(struct task_struct *task, int clear, int set)
!psi_bug) {
printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
task->pid, task->comm, cpu,
- task->psi_flags, clear, set);
+ task->psi_flags & PSI_TSK_STATE_MASK,
+ clear, set);
psi_bug = 1;
}

@@ -818,17 +819,17 @@ void psi_memstall_enter(unsigned long *flags)
if (static_branch_likely(&psi_disabled))
return;

- *flags = current->flags & PF_MEMSTALL;
+ *flags = current->psi_flags & PSI_PF_MEMSTALL;
if (*flags)
return;
/*
- * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * PSI_PF_MEMSTALL setting & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we can
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);

- current->flags |= PF_MEMSTALL;
+ current->psi_flags |= PSI_PF_MEMSTALL;
psi_task_change(current, 0, TSK_MEMSTALL);

rq_unlock_irq(rq, &rf);
@@ -851,13 +852,13 @@ void psi_memstall_leave(unsigned long *flags)
if (*flags)
return;
/*
- * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * PSI_PF_MEMSTALL clearing & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we could
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);

- current->flags &= ~PF_MEMSTALL;
+ current->psi_flags &= ~PSI_PF_MEMSTALL;
psi_task_change(current, TSK_MEMSTALL, 0);

rq_unlock_irq(rq, &rf);
@@ -921,7 +922,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
else if (task->in_iowait)
task_flags = TSK_IOWAIT;

- if (task->flags & PF_MEMSTALL)
+ if (task->psi_flags & PSI_PF_MEMSTALL)
task_flags |= TSK_MEMSTALL;

if (task_flags)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe81a6e..164f97b1ce7f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
return;

if (!wakeup || p->sched_psi_wake_requeue) {
- if (p->flags & PF_MEMSTALL)
+ if (p->psi_flags & PSI_PF_MEMSTALL)
set |= TSK_MEMSTALL;
if (p->sched_psi_wake_requeue)
p->sched_psi_wake_requeue = 0;
@@ -90,7 +90,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;

if (!sleep) {
- if (p->flags & PF_MEMSTALL)
+ if (p->psi_flags & PSI_PF_MEMSTALL)
clear |= TSK_MEMSTALL;
} else {
if (p->in_iowait)
@@ -109,14 +109,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
+ if (unlikely(p->in_iowait || (p->psi_flags & PSI_PF_MEMSTALL))) {
struct rq_flags rf;
struct rq *rq;
int clear = 0;

if (p->in_iowait)
clear |= TSK_IOWAIT;
- if (p->flags & PF_MEMSTALL)
+ if (p->psi_flags & PSI_PF_MEMSTALL)
clear |= TSK_MEMSTALL;

rq = __task_rq_lock(p, &rf);
@@ -131,7 +131,7 @@ static inline void psi_task_tick(struct rq *rq)
if (static_branch_likely(&psi_disabled))
return;

- if (unlikely(rq->curr->flags & PF_MEMSTALL))
+ if (unlikely(rq->curr->psi_flags & PSI_PF_MEMSTALL))
psi_memstall_tick(rq->curr, cpu_of(rq));
}
#else /* CONFIG_PSI */
--
Yafang Shao
DiDi


2020-02-24 16:26:40

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] psi: move PF_MEMSTALL into psi specific psi_flags

Hello Yafang,

On Sat, Feb 22, 2020 at 09:46:47AM -0500, Yafang Shao wrote:
> The task->flags is a 32-bits flag, in which 31 bits have already been
> consumed. So it is hardly to introduce other new per process flag.
> As there's a psi specific flag psi_flags, we'd better move the psi specific
> per process flag PF_MEMSTALL into it.

Currently, psi_flags is used only for debugging:

if (((task->psi_flags & set) ||
(task->psi_flags & clear) != clear) &&
!psi_bug) {
printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
task->pid, task->comm, cpu,
task->psi_flags, clear, set);
psi_bug = 1;
}

task->psi_flags &= ~clear;
task->psi_flags |= set;

While this has caught a few bugs while the code was new, I'm planning
on moving it to a CONFIG option that is only enabled in debug builds.

If you need the room in task->flags, can you please make the memstall
state a single bit in task_struct instead? AFAICS there is still space
in this section:

/* Force alignment to the next boundary: */
unsigned :0;

/* Unserialized, strictly 'current' */

...

#ifdef CONFIG_PSI
unsigned in_memstall:1;
#endif

It would also avoid the mixed-bit masking headache:

> @@ -17,11 +17,21 @@ enum psi_task_count {
> NR_PSI_TASK_COUNTS = 3,
> };
>
> -/* Task state bitmasks */
> +/*
> + * Task state bitmasks:
> + * These flags are stored in the lower PSI_TSK_BITS bits of
> + * task->psi_flags, and the higher bits are set with per process flag which
> + * persists across sleeps.
> + */
> +#define PSI_TSK_STATE_BITS 16
> +#define PSI_TSK_STATE_MASK ((1 << PSI_TSK_STATE_BITS) - 1)
> #define TSK_IOWAIT (1 << NR_IOWAIT)
> #define TSK_MEMSTALL (1 << NR_MEMSTALL)
> #define TSK_RUNNING (1 << NR_RUNNING)
>
> +/* Stalled due to lack of memory, that's per process flag. */
> +#define PSI_PF_MEMSTALL (1 << PSI_TSK_STATE_BITS)
> +
> /* Resources that workloads could be stalled on */
> enum psi_res {
> PSI_IO,
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f314790cb527..2d4c04d35d9b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1025,7 +1025,11 @@ struct task_struct {
>
> struct task_io_accounting ioac;
> #ifdef CONFIG_PSI
> - /* Pressure stall state */
> + /*
> + * Pressure stall state:
> + * Bits 0 ~ PSI_TSK_STATE_BITS-1: PSI task states
> + * Bits PSI_TSK_STATE_BITS ~ 31: Per process flags
> + */
> unsigned int psi_flags;
> #endif
> #ifdef CONFIG_TASK_XACCT

Thanks

2020-02-26 14:24:27

by Yafang Shao

[permalink] [raw]
Subject: Re: [PATCH] psi: move PF_MEMSTALL into psi specific psi_flags

On Tue, Feb 25, 2020 at 12:25 AM Johannes Weiner <[email protected]> wrote:
>
> Hello Yafang,
>
> On Sat, Feb 22, 2020 at 09:46:47AM -0500, Yafang Shao wrote:
> > The task->flags is a 32-bits flag, in which 31 bits have already been
> > consumed. So it is hardly to introduce other new per process flag.
> > As there's a psi specific flag psi_flags, we'd better move the psi specific
> > per process flag PF_MEMSTALL into it.
>
> Currently, psi_flags is used only for debugging:
>
> if (((task->psi_flags & set) ||
> (task->psi_flags & clear) != clear) &&
> !psi_bug) {
> printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
> task->pid, task->comm, cpu,
> task->psi_flags, clear, set);
> psi_bug = 1;
> }
>
> task->psi_flags &= ~clear;
> task->psi_flags |= set;
>
> While this has caught a few bugs while the code was new, I'm planning
> on moving it to a CONFIG option that is only enabled in debug builds.
>

Got it. Many thanks for your explanation.

> If you need the room in task->flags, can you please make the memstall
> state a single bit in task_struct instead? AFAICS there is still space
> in this section:
>
> /* Force alignment to the next boundary: */
> unsigned :0;
>
> /* Unserialized, strictly 'current' */
>
> ...
>
> #ifdef CONFIG_PSI
> unsigned in_memstall:1;
> #endif
>
> It would also avoid the mixed-bit masking headache:
>

That seems to be a better solution. I will update the patch accordingly.
Thanks for your suggestion.

> > @@ -17,11 +17,21 @@ enum psi_task_count {
> > NR_PSI_TASK_COUNTS = 3,
> > };
> >
> > -/* Task state bitmasks */
> > +/*
> > + * Task state bitmasks:
> > + * These flags are stored in the lower PSI_TSK_BITS bits of
> > + * task->psi_flags, and the higher bits are set with per process flag which
> > + * persists across sleeps.
> > + */
> > +#define PSI_TSK_STATE_BITS 16
> > +#define PSI_TSK_STATE_MASK ((1 << PSI_TSK_STATE_BITS) - 1)
> > #define TSK_IOWAIT (1 << NR_IOWAIT)
> > #define TSK_MEMSTALL (1 << NR_MEMSTALL)
> > #define TSK_RUNNING (1 << NR_RUNNING)
> >
> > +/* Stalled due to lack of memory, that's per process flag. */
> > +#define PSI_PF_MEMSTALL (1 << PSI_TSK_STATE_BITS)
> > +
> > /* Resources that workloads could be stalled on */
> > enum psi_res {
> > PSI_IO,
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index f314790cb527..2d4c04d35d9b 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1025,7 +1025,11 @@ struct task_struct {
> >
> > struct task_io_accounting ioac;
> > #ifdef CONFIG_PSI
> > - /* Pressure stall state */
> > + /*
> > + * Pressure stall state:
> > + * Bits 0 ~ PSI_TSK_STATE_BITS-1: PSI task states
> > + * Bits PSI_TSK_STATE_BITS ~ 31: Per process flags
> > + */
> > unsigned int psi_flags;
> > #endif
> > #ifdef CONFIG_TASK_XACCT
>
> Thanks



--
Yafang Shao
DiDi