2015-12-01 15:29:39

by Daniel Vetter

[permalink] [raw]
Subject: [PATCH 1/2] kernel/latencytop: Add non-scheduler interface for latency reporting

Some sources of significant amounts of latency aren't simple sleeps
but instead busy-loops or a series of hundreds of small sleeps simply
because the hardware can't do better. Unfortunately latencytop doesn't
register these and so they slip under the radar. Hence expose a
simplified interface to report additional latencies and export the
underlying function so that modules can use this.

The example I have in mind are edid reads. The drm subsystem exposes
both interfaces to do full probes and to just get at the cached state
from the last probe and often userspace developers don't know about
the difference and incur unnecessarily big latencies. And usually the i2c
transfer is done with busy-looping or if there is a hw engine it might
only be able to transfer a few bytes per sleep/irq cycle. And edid
reads take at least 12ms and with crappy hw can easily be a few
hundred ms.

Cc: Thomas Gleixner <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Daniel Vetter <[email protected]>
---
include/linux/latencytop.h | 15 +++++++++++++++
kernel/latencytop.c | 2 ++
2 files changed, 17 insertions(+)

diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..46b69bc35f02 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,9 @@
#define _INCLUDE_GUARD_LATENCYTOP_H_

#include <linux/compiler.h>
+
+#include <asm/current.h>
+
struct task_struct;

#ifdef CONFIG_LATENCYTOP
@@ -35,6 +38,13 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
__account_scheduler_latency(task, usecs, inter);
}

+static inline void
+account_latency(int usecs)
+{
+ if (unlikely(latencytop_enabled))
+ __account_scheduler_latency(current, usecs, 0);
+}
+
void clear_all_latency_tracing(struct task_struct *p);

#else
@@ -44,6 +54,11 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
{
}

+static inline void
+account_latency(int usecs)
+{
+}
+
static inline void clear_all_latency_tracing(struct task_struct *p)
{
}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b066a19fc52a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,6 +64,7 @@ static DEFINE_RAW_SPINLOCK(latency_lock);
static struct latency_record latency_record[MAXLR];

int latencytop_enabled;
+EXPORT_SYMBOL_GPL(latencytop_enabled);

void clear_all_latency_tracing(struct task_struct *p)
{
@@ -234,6 +235,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
out_unlock:
raw_spin_unlock_irqrestore(&latency_lock, flags);
}
+EXPORT_SYMBOL_GPL(__account_scheduler_latency);

static int lstats_show(struct seq_file *m, void *v)
{
--
2.5.1


2015-12-01 15:30:17

by Daniel Vetter

[permalink] [raw]
Subject: [PATCH 2/2] drm/edid: report latency due to reading edids

A forced EDID read takes 22.5ms best-case, and that's per 128byte
block. HDMI screens tend to have 2-3 of those. Multiply that by a few
outputs and then it's clear that userspace really never ever should
re-probe connector state on its own and trust the kernel to tell it
when anything changed. The only exception is a manual reprobe button that
the user must press themselves (for extremely shitty KVM switches that
don't wire up hotplug handling properly).

There have been bugs in the past, but we're slowly fixing them up. To
the point even that some of the most abused interfaces (e.g. in sysfs)
have been changed to only return the cached state ever due to too much
polling by userspace.

But there's other places where we can't pull these tricks, so give
userspace the tools to notice their abuse and expose delays due to
EDID reads in latencytop.

Cc: Thomas Gleixner <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Daniel Vetter <[email protected]>
---
drivers/gpu/drm/drm_edid.c | 27 +++++++++++++++++++--------
1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index c214f1246cb4..370003e0cc69 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -32,6 +32,7 @@
#include <linux/hdmi.h>
#include <linux/i2c.h>
#include <linux/module.h>
+#include <linux/latencytop.h>
#include <drm/drmP.h>
#include <drm/drm_edid.h>
#include <drm/drm_displayid.h>
@@ -1272,14 +1273,17 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
int i, j = 0, valid_extensions = 0;
u8 *block, *new;
bool print_bad_edid = !connector->bad_edid_counter || (drm_debug & DRM_UT_KMS);
+ u64 before, after;

if ((block = kmalloc(EDID_LENGTH, GFP_KERNEL)) == NULL)
return NULL;

+ before = ktime_get_raw_ns();
+
/* base block fetch */
for (i = 0; i < 4; i++) {
if (get_edid_block(data, block, 0, EDID_LENGTH))
- goto out;
+ goto none;
if (drm_edid_block_valid(block, 0, print_bad_edid,
&connector->edid_corrupt))
break;
@@ -1293,11 +1297,11 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,

/* if there's no extensions, we're done */
if (block[0x7e] == 0)
- return (struct edid *)block;
+ goto out;

new = krealloc(block, (block[0x7e] + 1) * EDID_LENGTH, GFP_KERNEL);
if (!new)
- goto out;
+ goto none;
block = new;

for (j = 1; j <= block[0x7e]; j++) {
@@ -1305,7 +1309,7 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
if (get_edid_block(data,
block + (valid_extensions + 1) * EDID_LENGTH,
j, EDID_LENGTH))
- goto out;
+ goto none;
if (drm_edid_block_valid(block + (valid_extensions + 1)
* EDID_LENGTH, j,
print_bad_edid,
@@ -1329,11 +1333,11 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
block[0x7e] = valid_extensions;
new = krealloc(block, (valid_extensions + 1) * EDID_LENGTH, GFP_KERNEL);
if (!new)
- goto out;
+ goto none;
block = new;
}

- return (struct edid *)block;
+ goto out;

carp:
if (print_bad_edid) {
@@ -1342,9 +1346,16 @@ carp:
}
connector->bad_edid_counter++;

-out:
+none:
kfree(block);
- return NULL;
+ block = NULL;
+
+out:
+ after = ktime_get_raw_ns();
+
+ account_latency(DIV_ROUND_UP_ULL(after - before, 1000));
+
+ return (struct edid *)block;
}
EXPORT_SYMBOL_GPL(drm_do_get_edid);

--
2.5.1

2015-12-01 15:47:19

by Chris Wilson

[permalink] [raw]
Subject: Re: [PATCH 1/2] kernel/latencytop: Add non-scheduler interface for latency reporting

On Tue, Dec 01, 2015 at 04:29:27PM +0100, Daniel Vetter wrote:
> Some sources of significant amounts of latency aren't simple sleeps
> but instead busy-loops or a series of hundreds of small sleeps simply
> because the hardware can't do better. Unfortunately latencytop doesn't
> register these and so they slip under the radar. Hence expose a
> simplified interface to report additional latencies and export the
> underlying function so that modules can use this.
>
> The example I have in mind are edid reads. The drm subsystem exposes
> both interfaces to do full probes and to just get at the cached state
> from the last probe and often userspace developers don't know about
> the difference and incur unecessary big latencies. And usually the i2c
> transfer is done with busy-looping or if there is a hw engine it might
> only be able to transfer a few bytes per sleep/irq cycle. And edid
> reads take at least 12ms and with crappy hw can easily be a few
> hundred ms.
>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Arjan van de Ven <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Signed-off-by: Daniel Vetter <[email protected]>
> ---
> include/linux/latencytop.h | 15 +++++++++++++++
> kernel/latencytop.c | 2 ++
> 2 files changed, 17 insertions(+)
>
> diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
> index e23121f9d82a..46b69bc35f02 100644
> --- a/include/linux/latencytop.h
> +++ b/include/linux/latencytop.h
> @@ -10,6 +10,9 @@
> #define _INCLUDE_GUARD_LATENCYTOP_H_
>
> #include <linux/compiler.h>
> +
> +#include <asm/current.h>
> +
> struct task_struct;
>
> #ifdef CONFIG_LATENCYTOP
> @@ -35,6 +38,13 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
> __account_scheduler_latency(task, usecs, inter);
> }
>
> +static inline void
> +account_latency(int usecs)
> +{
> + if (unlikely(latencytop_enabled))
> + __account_scheduler_latency(current, usecs, 0);

Just

account_scheduler_latency(current, usecs, 0);
> +}

And then that can be used for both ifdef paths, i.e. move account_latency() to after the #endif.
-Chris

--
Chris Wilson, Intel Open Source Technology Centre

2015-12-01 15:54:13

by Daniel Vetter

[permalink] [raw]
Subject: [PATCH] kernel/latencytop: Add non-scheduler interface for latency reporting

Some sources of significant amounts of latency aren't simple sleeps
but instead busy-loops or a series of hundreds of small sleeps simply
because the hardware can't do better. Unfortunately latencytop doesn't
register these and so they slip under the radar. Hence expose a
simplified interface to report additional latencies and export the
underlying function so that modules can use this.

The example I have in mind are edid reads. The drm subsystem exposes
both interfaces to do full probes and to just get at the cached state
from the last probe and often userspace developers don't know about
the difference and incur unnecessarily big latencies. And usually the i2c
transfer is done with busy-looping or if there is a hw engine it might
only be able to transfer a few bytes per sleep/irq cycle. And edid
reads take at least 12ms and with crappy hw can easily be a few
hundred ms.

v2: Simplify #ifdefs a bit (Chris).

Cc: Chris Wilson <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Daniel Vetter <[email protected]>
---
include/linux/latencytop.h | 9 +++++++++
kernel/latencytop.c | 2 ++
2 files changed, 11 insertions(+)

diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..6f7c35a0bbfe 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,9 @@
#define _INCLUDE_GUARD_LATENCYTOP_H_

#include <linux/compiler.h>
+
+#include <asm/current.h>
+
struct task_struct;

#ifdef CONFIG_LATENCYTOP
@@ -50,4 +53,10 @@ static inline void clear_all_latency_tracing(struct task_struct *p)

#endif

+static inline void
+account_latency(int usecs)
+{
+ account_scheduler_latency(current, usecs, 0);
+}
+
#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b066a19fc52a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,6 +64,7 @@ static DEFINE_RAW_SPINLOCK(latency_lock);
static struct latency_record latency_record[MAXLR];

int latencytop_enabled;
+EXPORT_SYMBOL_GPL(latencytop_enabled);

void clear_all_latency_tracing(struct task_struct *p)
{
@@ -234,6 +235,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
out_unlock:
raw_spin_unlock_irqrestore(&latency_lock, flags);
}
+EXPORT_SYMBOL_GPL(__account_scheduler_latency);

static int lstats_show(struct seq_file *m, void *v)
{
--
2.5.1