perf_event_mux_interval_ms_store() tries to apply an updated
hrtimer_interval to all possible cpus in a completely unsafe way. The
changelog of the offending commit says:
"In the 5th version, we handle the reprogramming of the hrtimer using
hrtimer_forward_now(). That way, we sync up to new timer value
quickly (suggested by Jiri Olsa)."
The hrtimer reprogramming is completely unrelated to
hrtimer_forward_now(). hrtimer_forward_now() merily forwards the
expiry time of the hrtimer past now and that's where things get really
bad in this code:
The forward function is called on enqueued timers. That means the
update of the expiry time corrupts the ordering of the hrtimer rbtree.
The proper way to update hrtimers on remote cpus is to use a smp
function call on all online cpus and perform the update locally by
canceling the timer, forwarding and restarting it.
Fixes: 62b856397927 "perf: Add sysfs entry to adjust multiplexing interval per PMU"
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: [email protected]
Cc: Arnaldo Carvalho de Melo <[email protected]>
---
kernel/events/core.c | 36 ++++++++++++++++++++----------------
1 file changed, 20 insertions(+), 16 deletions(-)
Index: linux/kernel/events/core.c
===================================================================
--- linux.orig/kernel/events/core.c
+++ linux/kernel/events/core.c
@@ -6827,13 +6827,27 @@ perf_event_mux_interval_ms_show(struct d
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
+static void perf_event_mux_update_interval(void *info)
+{
+ struct pmu *pmu = info;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ int was_armed = hrtimer_cancel(&cpuctx->hrtimer);
+
+ /* Update the interval and restart the timer with the new interval */
+ cpuctx->hrtimer_interval = ms_to_ktime(pmu->hrtimer_interval_ms);
+ if (was_armed) {
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ hrtimer_start_expires(&cpuctx->hrtimer, HRTIMER_MODE_ABS);
+ }
+}
+
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct pmu *pmu = dev_get_drvdata(dev);
- int timer, cpu, ret;
+ int timer, ret;
ret = kstrtoint(buf, 0, &timer);
if (ret)
@@ -6842,22 +6856,12 @@ perf_event_mux_interval_ms_store(struct
if (timer < 1)
return -EINVAL;
- /* same value, noting to do */
- if (timer == pmu->hrtimer_interval_ms)
- return count;
-
- pmu->hrtimer_interval_ms = timer;
-
- /* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
-
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ if (timer != pmu->hrtimer_interval_ms) {
+ get_online_cpus();
+ pmu->hrtimer_interval_ms = timer;
+ on_each_cpu(perf_event_mux_update_interval, pmu, 1);
+ put_online_cpus();
}
-
return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
On Mon, Apr 13, 2015 at 2:02 PM, Thomas Gleixner <[email protected]> wrote:
>
> perf_event_mux_interval_ms_store() tries to apply an updated
> hrtimer_interval to all possible cpus in a completely unsafe way. The
> changelog of the offending commit says:
>
> "In the 5th version, we handle the reprogramming of the hrtimer using
> hrtimer_forward_now(). That way, we sync up to new timer value
> quickly (suggested by Jiri Olsa)."
>
> The hrtimer reprogramming is completely unrelated to
> hrtimer_forward_now(). hrtimer_forward_now() merily forwards the
> expiry time of the hrtimer past now and that's where things get really
> bad in this code:
>
> The forward function is called on enqueued timers. That means the
> update of the expiry time corrupts the ordering of the hrtimer rbtree.
>
> The proper way to update hrtimers on remote cpus is to use a smp
> function call on all online cpus and perform the update locally by
> canceling the timer, forwarding and restarting it.
>
> Fixes: 62b856397927 "perf: Add sysfs entry to adjust multiplexing interval per PMU"
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: Stephane Eranian <[email protected]>
> Cc: Jiri Olsa <[email protected]>
> Cc: [email protected]
> Cc: Arnaldo Carvalho de Melo <[email protected]>
Thanks for fixing this Thomas!
>
> ---
> kernel/events/core.c | 36 ++++++++++++++++++++----------------
> 1 file changed, 20 insertions(+), 16 deletions(-)
>
> Index: linux/kernel/events/core.c
> ===================================================================
> --- linux.orig/kernel/events/core.c
> +++ linux/kernel/events/core.c
> @@ -6827,13 +6827,27 @@ perf_event_mux_interval_ms_show(struct d
> return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
> }
>
> +static void perf_event_mux_update_interval(void *info)
> +{
> + struct pmu *pmu = info;
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> + int was_armed = hrtimer_cancel(&cpuctx->hrtimer);
> +
> + /* Update the interval and restart the timer with the new interval */
> + cpuctx->hrtimer_interval = ms_to_ktime(pmu->hrtimer_interval_ms);
> + if (was_armed) {
> + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
> + hrtimer_start_expires(&cpuctx->hrtimer, HRTIMER_MODE_ABS);
> + }
> +}
> +
> static ssize_t
> perf_event_mux_interval_ms_store(struct device *dev,
> struct device_attribute *attr,
> const char *buf, size_t count)
> {
> struct pmu *pmu = dev_get_drvdata(dev);
> - int timer, cpu, ret;
> + int timer, ret;
>
> ret = kstrtoint(buf, 0, &timer);
> if (ret)
> @@ -6842,22 +6856,12 @@ perf_event_mux_interval_ms_store(struct
> if (timer < 1)
> return -EINVAL;
>
> - /* same value, noting to do */
> - if (timer == pmu->hrtimer_interval_ms)
> - return count;
> -
> - pmu->hrtimer_interval_ms = timer;
> -
> - /* update all cpuctx for this PMU */
> - for_each_possible_cpu(cpu) {
> - struct perf_cpu_context *cpuctx;
> - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
> -
> - if (hrtimer_active(&cpuctx->hrtimer))
> - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
> + if (timer != pmu->hrtimer_interval_ms) {
> + get_online_cpus();
> + pmu->hrtimer_interval_ms = timer;
> + on_each_cpu(perf_event_mux_update_interval, pmu, 1);
> + put_online_cpus();
> }
> -
> return count;
> }
> static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
>
>