2013-07-01 07:12:00

by Lei Wen

Subject: [PATCH 0/2] sched: add trace event for per-entity tracking

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This patch set adds trace support for it, so that we can quickly assess the
system status over a large time scale; for example, we can now obtain each
runqueue's usage ratio as:

cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
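
As a rough sketch of how that ratio could be consumed, the helper below turns
the two fields into a percentage; cfs_rq_usage_pct() is a hypothetical name
and not part of this series:

	/*
	 * Hypothetical helper, not part of this series: express the ratio
	 * above as a percentage.  An empty cfs_rq has load.weight == 0, so
	 * guard against dividing by zero.  Assumes <linux/math64.h> for
	 * div64_u64().
	 */
	static inline unsigned long cfs_rq_usage_pct(struct cfs_rq *cfs_rq)
	{
		u64 weight = cfs_rq->load.weight;

		if (!weight)
			return 0;
		return div64_u64((u64)cfs_rq->runnable_load_avg * 100, weight);
	}

The trace events below export exactly these two fields, so the same division
can equally be done offline on the parsed trace.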

Lei Wen (2):
sched: add trace events for task and rq usage tracking
sched: update cfs_rq weight earlier in enqueue_entity

include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 31 ++++++++++++++++--
2 files changed, 101 insertions(+), 3 deletions(-)

--
1.7.10.4


2013-07-01 07:11:20

by Lei Wen

Subject: [PATCH 1/2] sched: add trace events for task and rq usage tracking

Since we can now track tasks at the entity level, we may want to
investigate tasks' running status by recording trace info, so that
we can do some tuning if needed.

Signed-off-by: Lei Wen <[email protected]>
---
include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 29 +++++++++++++++--
2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..8f1af65 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,79 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);

+TRACE_EVENT(sched_task_weighted_load,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load, unsigned long weight),
+
+ TP_ARGS(tsk, load, weight),
+
+ TP_STRUCT__entry(
+ __field(pid_t, pid)
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, weight)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->cpu = task_thread_info(tsk)->cpu;
+ __entry->load = load;
+ __entry->weight= weight;
+ ),
+
+ TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+ __entry->cpu, __entry->pid,
+ __entry->load, __entry->weight)
+);
+
+TRACE_EVENT(sched_cfs_rq_runnable_load,
+
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+ TP_ARGS(cpu, load, total),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ __entry->total = total;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+);
+
+TRACE_EVENT(sched_cfs_rq_blocked_load,
+
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+ TP_ARGS(cpu, load, total),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ __entry->total = total;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+);
+
#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..07bd74c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return 0;

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
se->avg.decay_count = 0;

return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
}

/* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,16 @@ static inline void update_entity_load_avg(struct sched_entity *se,
if (!update_cfs_rq)
return;

- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->runnable_load_avg, cfs_rq->load.weight);
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
+ }
}

/*
@@ -1531,6 +1539,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
}

__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1595,15 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
/* migrated tasks did not contribute to our blocked load */
if (wakeup) {
subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
update_entity_load_avg(se, 0);
}

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->runnable_load_avg, cfs_rq->load.weight);
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -1608,6 +1624,9 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
@@ -5894,6 +5913,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
__synchronize_entity_decay(&p->se);
subtract_blocked_load_contrib(cfs_rq,
p->se.avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
}
#endif
}
@@ -5994,6 +6016,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
*/
p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
#endif
}
}
--
1.7.10.4

2013-07-01 07:14:36

by Lei Wen

Subject: [PATCH 2/2] sched: update cfs_rq weight earlier in enqueue_entity

Since we are going to calculate the cfs_rq's average ratio as
runnable_load_avg/load.weight, not increasing load.weight prior to
enqueue_entity_load_avg may lead to a cfs_rq's average ratio higher
than 100%.

Adjust the sequence so that the ratio is always kept at or below 100%.

Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07bd74c..d1eee84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
update_cfs_shares(cfs_rq);

if (flags & ENQUEUE_WAKEUP) {
--
1.7.10.4

2013-07-01 08:07:38

by Alex Shi

Subject: Re: [PATCH 0/2] sched: add trace event for per-entity tracking

On 07/01/2013 03:10 PM, Lei Wen wrote:
> Thanks to the per-entity load tracking feature, we can now see the details
> of each task.
> This patch set adds trace support for it, so that we can quickly assess the
> system status over a large time scale; for example, we can now obtain each
> runqueue's usage ratio as:
>
> cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
>

The direct usage ratio is rq.avg.runnable_avg_sum / rq.avg.runnable_avg_period.

One patch from the obsolete power-scheduling tree could serve as a reference for this:
[email protected]:alexshi/power-scheduling.git power-scheduling

From 081cd4bcbccfaa1930b031e4dfbf9d23b8c0d5ab Mon Sep 17 00:00:00 2001
From: Alex Shi <[email protected]>
Date: Fri, 7 Dec 2012 21:37:58 +0800
Subject: [PATCH 02/23] sched: log the cpu utilization at rq

The cpu's utilization measures how busy the cpu is.
util = cpu_rq(cpu)->avg.runnable_avg_sum * SCHED_POWER_SCALE
/ cpu_rq(cpu)->avg.runnable_avg_period;

Since util is no more than 1, we scale its value by 1024, the same as
SCHED_POWER_SCALE, and set FULL_UTIL to 1024.

In later power-aware scheduling, we are sensitive to how busy the cpu is,
since power consumption is tightly related to cpu busy time.

BTW, rq->util can be used for other purposes if needed, not only power
scheduling.

Signed-off-by: Alex Shi <[email protected]>
---
include/linux/sched.h | 2 +-
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 5 +++++
kernel/sched/sched.h | 4 ++++
4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9539597..4e4d9ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -794,7 +794,7 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)

/*
- * Increase resolution of cpu_power calculations
+ * Increase resolution of cpu_power and rq->util calculations
*/
#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a6..f5db759 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -311,6 +311,7 @@ do { \

P(ttwu_count);
P(ttwu_local);
+ P(util);

#undef P
#undef P64
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e49c3f..7124244 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1495,8 +1495,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
+ u32 period;
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
+
+ period = rq->avg.runnable_avg_period ? rq->avg.runnable_avg_period : 1;
+ rq->util = (u64)(rq->avg.runnable_avg_sum << SCHED_POWER_SHIFT)
+ / period;
}

/* Add the load generated by se into cfs_rq's child load-average */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 804ee41..8682110 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -351,6 +351,9 @@ extern struct root_domain def_root_domain;

#endif /* CONFIG_SMP */

+/* full cpu utilization */
+#define FULL_UTIL SCHED_POWER_SCALE
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -482,6 +485,7 @@ struct rq {
#endif

struct sched_avg avg;
+ unsigned int util;
};

static inline int cpu_of(struct rq *rq)
--
1.7.12

--
Thanks
Alex

2013-07-01 08:49:25

by Lei Wen

Subject: Re: [PATCH 0/2] sched: add trace event for per-entity tracking

Alex,

On Mon, Jul 1, 2013 at 4:06 PM, Alex Shi <[email protected]> wrote:
> On 07/01/2013 03:10 PM, Lei Wen wrote:
>> Thanks to the per-entity load tracking feature, we can now see the details
>> of each task.
>> This patch set adds trace support for it, so that we can quickly assess the
>> system status over a large time scale; for example, we can now obtain each
>> runqueue's usage ratio as:
>>
>> cfs_rq's usage ratio = cfs_rq->runnable_load_avg/cfs_rq->load.weight
>>
>
> The direct usage ratio is rq.avg.runnable_avg_sum / rq.avg.runnable_avg_period.


From the parsed data diagram, that looks nicer than my previous approach of
using load for the calculation. :)
BTW, do you think there is some value in doing the calculation below?
cfs_rq->runnable_load_avg/cfs_rq->load.weight

I think that by doing this calculation on the runnable_load_avg/blocked_load_avg
trace results, we may catch some abnormal load distribution when debugging.


>
> One patch from the obsolete power-scheduling tree could serve as a reference for this:
> [email protected]:alexshi/power-scheduling.git power-scheduling
>
> From 081cd4bcbccfaa1930b031e4dfbf9d23b8c0d5ab Mon Sep 17 00:00:00 2001
> From: Alex Shi <[email protected]>
> Date: Fri, 7 Dec 2012 21:37:58 +0800
> Subject: [PATCH 02/23] sched: log the cpu utilization at rq
>
> The cpu's utilization measures how busy the cpu is.
> util = cpu_rq(cpu)->avg.runnable_avg_sum * SCHED_POWER_SCALE
> / cpu_rq(cpu)->avg.runnable_avg_period;
>
> Since util is no more than 1, we scale its value by 1024, the same as
> SCHED_POWER_SCALE, and set FULL_UTIL to 1024.
>
> In later power-aware scheduling, we are sensitive to how busy the cpu is,
> since power consumption is tightly related to cpu busy time.
>
> BTW, rq->util can be used for other purposes if needed, not only power
> scheduling.
>
> Signed-off-by: Alex Shi <[email protected]>


Nice patch, will it be merged? :)

Thanks,
Lei
> ---
> include/linux/sched.h | 2 +-
> kernel/sched/debug.c | 1 +
> kernel/sched/fair.c | 5 +++++
> kernel/sched/sched.h | 4 ++++
> 4 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9539597..4e4d9ee 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -794,7 +794,7 @@ enum cpu_idle_type {
> #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
>
> /*
> - * Increase resolution of cpu_power calculations
> + * Increase resolution of cpu_power and rq->util calculations
> */
> #define SCHED_POWER_SHIFT 10
> #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 75024a6..f5db759 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -311,6 +311,7 @@ do { \
>
> P(ttwu_count);
> P(ttwu_local);
> + P(util);
>
> #undef P
> #undef P64
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 2e49c3f..7124244 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1495,8 +1495,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
>
> static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
> {
> + u32 period;
> __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
> __update_tg_runnable_avg(&rq->avg, &rq->cfs);
> +
> + period = rq->avg.runnable_avg_period ? rq->avg.runnable_avg_period : 1;
> + rq->util = (u64)(rq->avg.runnable_avg_sum << SCHED_POWER_SHIFT)
> + / period;
> }
>
> /* Add the load generated by se into cfs_rq's child load-average */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 804ee41..8682110 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -351,6 +351,9 @@ extern struct root_domain def_root_domain;
>
> #endif /* CONFIG_SMP */
>
> +/* full cpu utilization */
> +#define FULL_UTIL SCHED_POWER_SCALE
> +
> /*
> * This is the main, per-CPU runqueue data structure.
> *
> @@ -482,6 +485,7 @@ struct rq {
> #endif
>
> struct sched_avg avg;
> + unsigned int util;
> };
>
> static inline int cpu_of(struct rq *rq)
> --
> 1.7.12
>
> --
> Thanks
> Alex

2013-07-01 09:43:28

by Kamalesh Babulal

Subject: Re: [PATCH 1/2] sched: add trace events for task and rq usage tracking

* Lei Wen <[email protected]> [2013-07-01 15:10:32]:

> Since we can now track tasks at the entity level, we may want to
> investigate tasks' running status by recording trace info, so that
> we can do some tuning if needed.
>
> Signed-off-by: Lei Wen <[email protected]>
> ---
> include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 29 +++++++++++++++--
> 2 files changed, 100 insertions(+), 2 deletions(-)

[...]

>
> +TRACE_EVENT(sched_cfs_rq_runnable_load,
> +
> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> + TP_ARGS(cpu, load, total),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, total)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cpu;
> + __entry->load = load;
> + __entry->total = total;
> + ),
> +
> + TP_printk("cpu=%d avg=%lu total=%lu",
> + __entry->cpu,
> + __entry->load,
> + __entry->total)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_blocked_load,
> +
> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> + TP_ARGS(cpu, load, total),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, total)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cpu;
> + __entry->load = load;
> + __entry->total = total;
> + ),
> +
> + TP_printk("cpu=%d avg=%lu total=%lu",
> + __entry->cpu,
> + __entry->load,
> + __entry->total)
> +);
> +
> #endif /* _TRACE_SCHED_H */

The above trace points are the same and can be folded using an event class:

+DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
+
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+ TP_ARGS(cpu, load, total),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ __entry->total = total;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+);
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+ TP_ARGS(cpu, load, total));
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+ TP_ARGS(cpu, load, total));
+
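
For what it's worth, since TRACE_EVENT() is just DECLARE_EVENT_CLASS() plus
DEFINE_EVENT() under the hood, the tracepoint names are unchanged and the
call sites in fair.c can stay exactly as in your patch, e.g.:

	trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
			cfs_rq->runnable_load_avg, cfs_rq->load.weight);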

2013-07-01 12:19:03

by Lei Wen

Subject: Re: [PATCH 1/2] sched: add trace events for task and rq usage tracking

Hi Kamalesh,

On Mon, Jul 1, 2013 at 5:43 PM, Kamalesh Babulal
<[email protected]> wrote:
> * Lei Wen <[email protected]> [2013-07-01 15:10:32]:
>
>> Since we can now track tasks at the entity level, we may want to
>> investigate tasks' running status by recording trace info, so that
>> we can do some tuning if needed.
>>
>> Signed-off-by: Lei Wen <[email protected]>
>> ---
>> include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/fair.c | 29 +++++++++++++++--
>> 2 files changed, 100 insertions(+), 2 deletions(-)
>
> [...]
>
>>
>> +TRACE_EVENT(sched_cfs_rq_runnable_load,
>> +
>> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
>> +
>> + TP_ARGS(cpu, load, total),
>> +
>> + TP_STRUCT__entry(
>> + __field(int, cpu)
>> + __field(unsigned long, load)
>> + __field(unsigned long, total)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->cpu = cpu;
>> + __entry->load = load;
>> + __entry->total = total;
>> + ),
>> +
>> + TP_printk("cpu=%d avg=%lu total=%lu",
>> + __entry->cpu,
>> + __entry->load,
>> + __entry->total)
>> +);
>> +
>> +TRACE_EVENT(sched_cfs_rq_blocked_load,
>> +
>> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
>> +
>> + TP_ARGS(cpu, load, total),
>> +
>> + TP_STRUCT__entry(
>> + __field(int, cpu)
>> + __field(unsigned long, load)
>> + __field(unsigned long, total)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->cpu = cpu;
>> + __entry->load = load;
>> + __entry->total = total;
>> + ),
>> +
>> + TP_printk("cpu=%d avg=%lu total=%lu",
>> + __entry->cpu,
>> + __entry->load,
>> + __entry->total)
>> +);
>> +
>> #endif /* _TRACE_SCHED_H */
>
> The above trace points are the same and can be folded using an event class:

Nice abstraction. I will merge your change into my V2 patch.

Thanks,
Lei

>
> +DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
> +
> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
> +
> + TP_ARGS(cpu, load, total),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, total)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cpu;
> + __entry->load = load;
> + __entry->total = total;
> + ),
> +
> + TP_printk("cpu=%d avg=%lu total=%lu",
> + __entry->cpu,
> + __entry->load,
> + __entry->total)
> +);
> +
> +DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
> + TP_ARGS(cpu, load, total));
> +
> +DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
> + TP_PROTO(int cpu, unsigned long load, unsigned long total),
> + TP_ARGS(cpu, load, total));
> +
>

2013-07-01 12:34:16

by Lei Wen

Subject: [V2 1/2] sched: add trace events for task and rq usage tracking

Since we can now track tasks at the entity level, we may want to
investigate tasks' running status by recording trace info, so that
we can do some tuning if needed.

Signed-off-by: Lei Wen <[email protected]>
---
include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 29 +++++++++++++++++++--
2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..effe047 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,63 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);

+TRACE_EVENT(sched_task_weighted_load,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load, unsigned long weight),
+
+ TP_ARGS(tsk, load, weight),
+
+ TP_STRUCT__entry(
+ __field(pid_t, pid)
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, weight)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->cpu = task_thread_info(tsk)->cpu;
+ __entry->load = load;
+ __entry->weight= weight;
+ ),
+
+ TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+ __entry->cpu, __entry->pid,
+ __entry->load, __entry->weight)
+);
+
+DECLARE_EVENT_CLASS(sched_cfs_rq_load_contri_template,
+
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+
+ TP_ARGS(cpu, load, total),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ __entry->total = total;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+ );
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_runnable_load,
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+ TP_ARGS(cpu, load, total));
+
+DEFINE_EVENT(sched_cfs_rq_load_contri_template, sched_cfs_rq_blocked_load,
+ TP_PROTO(int cpu, unsigned long load, unsigned long total),
+ TP_ARGS(cpu, load, total));
+
#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..07bd74c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return 0;

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
se->avg.decay_count = 0;

return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
}

/* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,16 @@ static inline void update_entity_load_avg(struct sched_entity *se,
if (!update_cfs_rq)
return;

- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->runnable_load_avg, cfs_rq->load.weight);
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
+ }
}

/*
@@ -1531,6 +1539,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
}

__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1595,15 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
/* migrated tasks did not contribute to our blocked load */
if (wakeup) {
subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
update_entity_load_avg(se, 0);
}

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->runnable_load_avg, cfs_rq->load.weight);
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -1608,6 +1624,9 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
@@ -5894,6 +5913,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
__synchronize_entity_decay(&p->se);
subtract_blocked_load_contrib(cfs_rq,
p->se.avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
}
#endif
}
@@ -5994,6 +6016,9 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
*/
p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
+ cfs_rq->blocked_load_avg,
+ cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
#endif
}
}
--
1.7.10.4

2013-07-01 12:36:29

by Lei Wen

Subject: [PATCH V2 0/2] sched: add trace event for per-entity tracking

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This patch set adds trace support for it, so that we can quickly assess the
system status over a large time scale.

The "cfs_rq->runnable_load_avg/cfs_rq->load.weight" ratio is useful for
identifying the load distribution status across the whole system.

V2: Abstract sched_cfs_rq_runnable_load and sched_cfs_rq_blocked_load using
sched_cfs_rq_load_contri_template. Thanks to Kamalesh for this contribution!

Lei Wen (2):
sched: add trace events for task and rq usage tracking
sched: update cfs_rq weight earlier in enqueue_entity

include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 31 ++++++++++++++++--
2 files changed, 101 insertions(+), 3 deletions(-)

--
1.7.10.4

2013-07-01 12:37:05

by Lei Wen

Subject: [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity

Since we are going to calculate the cfs_rq's average ratio as
runnable_load_avg/load.weight, not increasing load.weight prior to
enqueue_entity_load_avg may lead to a cfs_rq's average ratio higher
than 100%.

Adjust the sequence so that the ratio is always kept at or below 100%.

Signed-off-by: Lei Wen <[email protected]>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07bd74c..d1eee84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
update_cfs_shares(cfs_rq);

if (flags & ENQUEUE_WAKEUP) {
--
1.7.10.4

2013-07-01 12:44:51

by Peter Zijlstra

Subject: Re: [V2 1/2] sched: add trace events for task and rq usage tracking

On Mon, Jul 01, 2013 at 08:33:21PM +0800, Lei Wen wrote:
> Since we can now track tasks at the entity level, we may want to
> investigate tasks' running status by recording trace info, so that
> we can do some tuning if needed.

Why would I want to merge this?


> + trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
> + trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);

> + trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->runnable_load_avg, cfs_rq->load.weight);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> + trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->runnable_load_avg, cfs_rq->load.weight);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
> + cfs_rq->blocked_load_avg,
> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);

You're not lazy enough by far, you seem to delight in endless repetition :/

How about you first convince me we actually want to merge this; big hint,
there's a significant lack of tracepoints in the entire balancer.

Secondly; WTH didn't you do:

trace_sched_task_weighted_load(se);
trace_sched_cfs_rq_runnable_load(cfs_rq);
trace_sched_cfs_rq_blocked_load(cfs_rq);

The tracepoints themselves could very well extract whatever they want from
that; no need to actually write it out.

2013-07-01 13:25:31

by Lei Wen

Subject: Re: [V2 1/2] sched: add trace events for task and rq usage tracking

Hi Peter,

On Mon, Jul 1, 2013 at 8:44 PM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Jul 01, 2013 at 08:33:21PM +0800, Lei Wen wrote:
>> Since we can now track tasks at the entity level, we may want to
>> investigate tasks' running status by recording trace info, so that
>> we can do some tuning if needed.
>
> Why would I want to merge this?

With merged trace points like those, we could then draw the load
distribution picture easily.

>
>
>> + trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
>> + trace_sched_task_weighted_load(task_of(se), se->avg.load_avg_contrib, se->load.weight);
>
>> + trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->runnable_load_avg, cfs_rq->load.weight);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> + trace_sched_cfs_rq_runnable_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->runnable_load_avg, cfs_rq->load.weight);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
>> + trace_sched_cfs_rq_blocked_load(cpu_of(rq_of(cfs_rq)),
>> + cfs_rq->blocked_load_avg,
>> + cfs_rq->blocked_load_avg + cfs_rq->runnable_load_avg);
>
> You're not lazy enough by far, you seem to delight in endless repetition :/

Yep, I already noticed this duplication...


>
> How about you first convince me we actually want to merge this; big hint,
> there's a significant lack of tracepoints in the entire balancer.

You already said what I wanted to say. :)
With these pre-embedded tracepoints, we could make our life easier when
tracking the system load. Especially since per-entity load tracking was
added only recently, people may want to use those trace points to get a
better understanding of this new feature.

>
> Secondly; WTH didn't you do:
>
> trace_sched_task_weighted_load(se);
> trace_sched_cfs_rq_runnable_load(cfs_rq);
> trace_sched_cfs_rq_blocked_load(cfs_rq);

So much cleaner than my previous one!

>
> The tracepoints themselves could very well extract whatever they want from
> that; no need to actually write it out.

2013-07-01 14:07:48

by Paul Turner

Subject: Re: [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity

Could you please restate the below?

On Mon, Jul 1, 2013 at 5:33 AM, Lei Wen <[email protected]> wrote:
> Since we are going to calculate cfs_rq's average ratio by
> runnable_load_avg/load.weight

I don't understand what you mean by this.

>, if not increase the load.weight prior to
> enqueue_entity_load_avg, it may lead to one cfs_rq's avg ratio higher
> than 100%.
>

Or this.

> Adjust the sequence, so that all ratio is kept below 100%.
>
> Signed-off-by: Lei Wen <[email protected]>
> ---
> kernel/sched/fair.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 07bd74c..d1eee84 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> * Update run-time statistics of the 'current'.
> */
> update_curr(cfs_rq);
> - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
> account_entity_enqueue(cfs_rq, se);
> + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

account_entity_enqueue is independent of enqueue_entity_load_avg;
their order should not matter.

Further, should we restore the reverted amortization commit (which improves
context switch times), enqueue_entity_load_avg needs to precede
account_entity_enqueue, as it may update se->load.weight.

> update_cfs_shares(cfs_rq);
>
> if (flags & ENQUEUE_WAKEUP) {
> --
> 1.7.10.4
>

2013-07-02 02:52:11

by Lei Wen

Subject: Re: [V2 2/2] sched: update cfs_rq weight earlier in enqueue_entity

Paul,

On Mon, Jul 1, 2013 at 10:07 PM, Paul Turner <[email protected]> wrote:
> Could you please restate the below?
>
> On Mon, Jul 1, 2013 at 5:33 AM, Lei Wen <[email protected]> wrote:
>> Since we are going to calculate cfs_rq's average ratio by
>> runnable_load_avg/load.weight
>
> I don't understand what you mean by this.

Previously I took the runnable_load_avg/load.weight calculation as the cfs_rq's
average ratio. But as Alex pointed out, runnable_avg_sum/runnable_avg_period
may serve this need better.

>
>>, if not increase the load.weight prior to
>> enqueue_entity_load_avg, it may lead to one cfs_rq's avg ratio higher
>> than 100%.
>>
>
> Or this.

In my mind, runnable_load_avg in one cfs_rq should always be less than
load.weight.
I am not sure whether this assumption holds here, but runnable_load_avg/load.weight
truly could show the cfs_rq's execution trend in some respects.

The previous problem is that enqueue_entity_load_avg is called before
account_entity_enqueue, which makes runnable_load_avg be updated first and
load.weight only afterwards.
So with the trace info logged inside enqueue_entity_load_avg, we may get a
calculation result of runnable_load_avg/load.weight > 1.
This result is not friendly for the final parsed data.
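
To illustrate with made-up numbers (not taken from any real trace): suppose one
task with load_avg_contrib = 1024 and load.weight = 1024 is already queued, so
runnable_load_avg = 1024 and load.weight = 1024, and then a second identical
task is enqueued.

	Old order:  enqueue_entity_load_avg() fires the tracepoint with
	            runnable_load_avg = 2048 while load.weight is still 1024,
	            so the parsed ratio is 2048/1024 = 200%.
	New order:  account_entity_enqueue() raises load.weight to 2048 first,
	            so the same tracepoint reports 2048/2048 = 100%.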


>
>> Adjust the sequence, so that all ratio is kept below 100%.
>>
>> Signed-off-by: Lei Wen <[email protected]>
>> ---
>> kernel/sched/fair.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 07bd74c..d1eee84 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1788,8 +1788,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>> * Update run-time statistics of the 'current'.
>> */
>> update_curr(cfs_rq);
>> - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>> account_entity_enqueue(cfs_rq, se);
>> + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>
> account_entity_enqueue is independent of enqueue_entity_load_avg;
> their order should not matter.

Yes, agreed, the order should not matter, but to keep the trace info
consistent, we may need a specific order here.

>
> Further, should we restore the reverted amortization commit (improves
> context switch times)


I don't understand here...
What does "should we restore the reverted amortization commit (improves
context switch times)" mean here...?


> enqueue_entity_load_avg needs to precede
> account_entity_enqueue as it may update se->load.weight.

Do you mean account_entity_enqueue needs to precede enqueue_entity_load_avg?

Thanks,
Lei


>
>> update_cfs_shares(cfs_rq);
>>
>> if (flags & ENQUEUE_WAKEUP) {
>> --
>> 1.7.10.4
>>

2013-07-02 12:15:42

by Lei Wen

Subject: [PATCH V3 0/2] sched: add trace event for per-entity tracking

Thanks to the per-entity load tracking feature, we can now see the details
of each task.
This patch set adds trace support for it, so that we can quickly assess the
system status over a large time scale.

The "cfs_rq->runnable_load_avg/cfs_rq->load.weight" ratio is useful for
identifying the load distribution status across the whole system.

With these pre-embedded tracepoints, we could make our life easier when
tracking the system load. Especially since per-entity load tracking was
added only recently, people may want to use those trace points to get a
better understanding of this new feature.

V3: Pass simpler parameters to the trace events and extract the details
only in the header file definitions. Thanks to Peter for pointing this out.

V2: Abstract sched_cfs_rq_runnable_load and sched_cfs_rq_blocked_load using
sched_cfs_rq_load_contri_template. Thanks to Kamalesh for this contribution!

Lei Wen (2):
sched: add trace events for task and rq usage tracking
sched: update cfs_rq weight earlier in enqueue_entity

include/trace/events/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 31 ++++++++++++++++--
2 files changed, 101 insertions(+), 3 deletions(-)

--
1.7.10.4

2013-07-02 12:20:20

by Lei Wen

Subject: [V3 1/2] sched: add trace events for task and rq usage tracking

Since we can now track tasks at the entity level, we may want to
investigate tasks' running status by recording trace info, so that
we can do some tuning if needed.

Signed-off-by: Lei Wen <[email protected]>
Cc: Alex Shi <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Kamalesh Babulal <[email protected]>
---
include/trace/events/sched.h | 76 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 15 +++++++--
2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..768b398 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,82 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);

+#ifdef CONFIG_SMP
+TRACE_EVENT(sched_task_weighted_load,
+
+ TP_PROTO(struct sched_entity *se),
+
+ TP_ARGS(se),
+
+ TP_STRUCT__entry(
+ __field(pid_t, pid)
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, weight)
+ ),
+
+ TP_fast_assign(
+ __entry->pid = container_of(se, struct task_struct, se)->pid;
+ __entry->cpu = se->cfs_rq->rq->cpu;
+ __entry->load = se->avg.load_avg_contrib;
+ __entry->weight= se->load.weight;
+ ),
+
+ TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
+ __entry->cpu, __entry->pid,
+ __entry->load, __entry->weight)
+);
+
+TRACE_EVENT(sched_cfs_rq_runnable_load,
+
+ TP_PROTO(struct cfs_rq *cfs_rq),
+
+ TP_ARGS(cfs_rq),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cfs_rq->rq->cpu;
+ __entry->load = cfs_rq->runnable_load_avg;
+ __entry->total = cfs_rq->load.weight;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+);
+
+TRACE_EVENT(sched_cfs_rq_blocked_load,
+
+ TP_PROTO(struct cfs_rq *cfs_rq),
+
+ TP_ARGS(cfs_rq),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, load)
+ __field(unsigned long, total)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cfs_rq->rq->cpu;
+ __entry->load = cfs_rq->blocked_load_avg;
+ __entry->total = cfs_rq->blocked_load_avg
+ + cfs_rq->runnable_load_avg;
+ ),
+
+ TP_printk("cpu=%d avg=%lu total=%lu",
+ __entry->cpu,
+ __entry->load,
+ __entry->total)
+);
+#endif
+
#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c5..2290469 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return 0;

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ trace_sched_task_weighted_load(se);
se->avg.decay_count = 0;

return decays;
@@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_weighted_load(se);
}

/* Compute the current contribution to load_avg by se, return any delta */
@@ -1498,10 +1500,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
if (!update_cfs_rq)
return;

- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ trace_sched_cfs_rq_runnable_load(cfs_rq);
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
+ }
}

/*
@@ -1531,6 +1536,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
}

__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1584,10 +1590,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
/* migrated tasks did not contribute to our blocked load */
if (wakeup) {
subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
update_entity_load_avg(se, 0);
}

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_runnable_load(cfs_rq);
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -1608,6 +1616,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
@@ -5894,6 +5903,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
__synchronize_entity_decay(&p->se);
subtract_blocked_load_contrib(cfs_rq,
p->se.avg.load_avg_contrib);
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
}
#endif
}
@@ -5994,6 +6004,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
*/
p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ trace_sched_cfs_rq_blocked_load(cfs_rq);
#endif
}
}
--
1.7.10.4

2013-07-02 12:21:04

by Lei Wen

Subject: [V3 2/2] sched: update cfs_rq weight earlier in enqueue_entity

We expect runnable_load_avg to be less than load.weight, so that
runnable_load_avg/load.weight can present the system's load distribution
well.

If load.weight is not increased prior to enqueue_entity_load_avg,
runnable_load_avg may end up higher than load.weight, which may confuse
people.

Signed-off-by: Lei Wen <[email protected]>
Cc: Alex Shi <[email protected]>
Cc: Paul Turner <[email protected]>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2290469..53224d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1778,8 +1778,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
update_cfs_shares(cfs_rq);

if (flags & ENQUEUE_WAKEUP) {
--
1.7.10.4

2013-07-03 12:46:15

by Lei Wen

Subject: Re: [V3 1/2] sched: add trace events for task and rq usage tracking

Hi Peter,

Do you have any further suggestions for this patch? :)

Thanks,
Lei

On Tue, Jul 2, 2013 at 8:15 PM, Lei Wen <[email protected]> wrote:
> Since we can now track tasks at the entity level, we may want to
> investigate tasks' running status by recording trace info, so that
> we can do some tuning if needed.
>
> Signed-off-by: Lei Wen <[email protected]>
> Cc: Alex Shi <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Kamalesh Babulal <[email protected]>
> ---
> include/trace/events/sched.h | 76 ++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 15 +++++++--
> 2 files changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index e5586ca..768b398 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -430,6 +430,82 @@ TRACE_EVENT(sched_pi_setprio,
> __entry->oldprio, __entry->newprio)
> );
>
> +#ifdef CONFIG_SMP
> +TRACE_EVENT(sched_task_weighted_load,
> +
> + TP_PROTO(struct sched_entity *se),
> +
> + TP_ARGS(se),
> +
> + TP_STRUCT__entry(
> + __field(pid_t, pid)
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, weight)
> + ),
> +
> + TP_fast_assign(
> + __entry->pid = container_of(se, struct task_struct, se)->pid;
> + __entry->cpu = se->cfs_rq->rq->cpu;
> + __entry->load = se->avg.load_avg_contrib;
> + __entry->weight= se->load.weight;
> + ),
> +
> + TP_printk("cpu=%d pid=%d load=%lu weight=%lu",
> + __entry->cpu, __entry->pid,
> + __entry->load, __entry->weight)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_runnable_load,
> +
> + TP_PROTO(struct cfs_rq *cfs_rq),
> +
> + TP_ARGS(cfs_rq),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, total)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cfs_rq->rq->cpu;
> + __entry->load = cfs_rq->runnable_load_avg;
> + __entry->total = cfs_rq->load.weight;
> + ),
> +
> + TP_printk("cpu=%d avg=%lu total=%lu",
> + __entry->cpu,
> + __entry->load,
> + __entry->total)
> +);
> +
> +TRACE_EVENT(sched_cfs_rq_blocked_load,
> +
> + TP_PROTO(struct cfs_rq *cfs_rq),
> +
> + TP_ARGS(cfs_rq),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(unsigned long, load)
> + __field(unsigned long, total)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cfs_rq->rq->cpu;
> + __entry->load = cfs_rq->blocked_load_avg;
> + __entry->total = cfs_rq->blocked_load_avg
> + + cfs_rq->runnable_load_avg;
> + ),
> +
> + TP_printk("cpu=%d avg=%lu total=%lu",
> + __entry->cpu,
> + __entry->load,
> + __entry->total)
> +);
> +#endif
> +
> #endif /* _TRACE_SCHED_H */
>
> /* This part must be outside protection */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f77f9c5..2290469 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1346,6 +1346,7 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
> return 0;
>
> se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
> + trace_sched_task_weighted_load(se);
> se->avg.decay_count = 0;
>
> return decays;
> @@ -1445,6 +1446,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
> contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
> contrib /= (se->avg.runnable_avg_period + 1);
> se->avg.load_avg_contrib = scale_load(contrib);
> + trace_sched_task_weighted_load(se);
> }
>
> /* Compute the current contribution to load_avg by se, return any delta */
> @@ -1498,10 +1500,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
> if (!update_cfs_rq)
> return;
>
> - if (se->on_rq)
> + if (se->on_rq) {
> cfs_rq->runnable_load_avg += contrib_delta;
> - else
> + trace_sched_cfs_rq_runnable_load(cfs_rq);
> + } else {
> subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> + }
> }
>
> /*
> @@ -1531,6 +1536,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
> }
>
> __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> }
>
> static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
> @@ -1584,10 +1590,12 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> /* migrated tasks did not contribute to our blocked load */
> if (wakeup) {
> subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> update_entity_load_avg(se, 0);
> }
>
> cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
> + trace_sched_cfs_rq_runnable_load(cfs_rq);
> /* we force update consideration on load-balancer moves */
> update_cfs_rq_blocked_load(cfs_rq, !wakeup);
> }
> @@ -1608,6 +1616,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
> if (sleep) {
> cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> } /* migrations, e.g. sleep=0 leave decay_count == 0 */
> }
> @@ -5894,6 +5903,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> __synchronize_entity_decay(&p->se);
> subtract_blocked_load_contrib(cfs_rq,
> p->se.avg.load_avg_contrib);
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> }
> #endif
> }
> @@ -5994,6 +6004,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
> */
> p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
> + trace_sched_cfs_rq_blocked_load(cfs_rq);
> #endif
> }
> }
> --
> 1.7.10.4
>