2021-03-21 22:52:07

by Shakeel Butt

Subject: [PATCH] psi: reduce calls to sched_clock() in psi

We noticed that the cost of psi increases with the depth of the cgroup
hierarchy. In particular, the cost of cpu_clock() stands out because the
kernel calls it once per level as it traverses up the cgroup tree. This
patch reduces those calls to a single clock read per event.
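
In essence, the change hoists the clock read out of the per-ancestor
loop, which also gives every level one consistent timestamp. A minimal
userspace sketch of the pattern (read_clock() and group_update() are
hypothetical stand-ins for cpu_clock() and psi_group_change(), not the
kernel code itself):

/* hoist.c: build with cc -O2 hoist.c */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for cpu_clock(cpu): a monotonic nanosecond timestamp. */
static uint64_t read_clock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Stand-in for psi_group_change(): consumes a timestamp per level. */
static void group_update(int level, uint64_t now)
{
	printf("level %d: now=%llu\n", level, (unsigned long long)now);
}

/* Before: one clock read per cgroup level walked. */
static void walk_before(int nr_levels)
{
	for (int level = 0; level < nr_levels; level++)
		group_update(level, read_clock());
}

/* After: a single clock read, reused at every level. */
static void walk_after(int nr_levels)
{
	uint64_t now = read_clock();

	for (int level = 0; level < nr_levels; level++)
		group_update(level, now);
}

int main(void)
{
	walk_before(3);
	walk_after(3);
	return 0;
}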

Ran perf bench on an Intel Broadwell machine with a cgroup hierarchy
3 levels deep.

Before the patch:

$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

Total time: 0.747 [sec]

# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes

Total time: 3.516 [sec]

3.516689 usecs/op
284358 ops/sec

After the patch:

$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

Total time: 0.640 [sec]

# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes

Total time: 3.329 [sec]

3.329820 usecs/op
300316 ops/sec

Signed-off-by: Shakeel Butt <[email protected]>
---
kernel/sched/psi.c | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee3c5b48622f..16348b269713 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
}

-static void record_times(struct psi_group_cpu *groupc, int cpu)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
{
u32 delta;
- u64 now;

- now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;

@@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu)
}

static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set,
+ unsigned int clear, unsigned int set, u64 now,
bool wake_clock)
{
struct psi_group_cpu *groupc;
@@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
*/
write_seqcount_begin(&groupc->seq);

- record_times(groupc, cpu);
+ record_times(groupc, now);

for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
@@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
+ u64 now;

if (!task->pid)
return;

psi_flags_change(task, clear, set);

+ now = cpu_clock(cpu);
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
@@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wake_clock = false;

while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, wake_clock);
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}

void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
+ u64 now = cpu_clock(cpu);

if (next->pid) {
bool identical_state;
@@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
break;
}

- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
}

@@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,

iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, clear, set, true);
+ psi_group_change(group, cpu, clear, set, now, true);

/*
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
@@ -867,7 +868,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
if (sleep) {
clear &= ~TSK_ONCPU;
for (; group; group = iterate_groups(prev, &iter))
- psi_group_change(group, cpu, clear, set, true);
+ psi_group_change(group, cpu, clear, set, now, true);
}
}
}
--
2.31.0.291.g576ba9dcdaf-goog


2021-03-22 07:49:05

by Peter Zijlstra

Subject: Re: [PATCH] psi: reduce calls to sched_clock() in psi

On Sun, Mar 21, 2021 at 01:51:56PM -0700, Shakeel Butt wrote:
> We noticed that the cost of psi increases with the depth of the cgroup
> hierarchy. [...]
>
> Signed-off-by: Shakeel Butt <[email protected]>

Thanks!

2021-03-22 14:21:14

by Johannes Weiner

Subject: Re: [PATCH] psi: reduce calls to sched_clock() in psi

On Sun, Mar 21, 2021 at 01:51:56PM -0700, Shakeel Butt wrote:
> We noticed that the cost of psi increases with the depth of the cgroup
> hierarchy. [...]
>
> Signed-off-by: Shakeel Butt <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

Subject: [tip: sched/core] psi: Reduce calls to sched_clock() in psi

The following commit has been merged into the sched/core branch of tip:

Commit-ID: df77430639c9cf73559bac0f25084518bf9a812d
Gitweb: https://git.kernel.org/tip/df77430639c9cf73559bac0f25084518bf9a812d
Author: Shakeel Butt <[email protected]>
AuthorDate: Sun, 21 Mar 2021 13:51:56 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 23 Mar 2021 16:01:58 +01:00

psi: Reduce calls to sched_clock() in psi

We noticed that the cost of psi increases with the depth of the cgroup
hierarchy. In particular, the cost of cpu_clock() stands out because the
kernel calls it once per level as it traverses up the cgroup tree. This
patch reduces those calls to a single clock read per event.

[...]

Signed-off-by: Shakeel Butt <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/psi.c | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c8480d7..b1b00e9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
[...]