hi,
I'm hitting a soft lockup generated by the fuzzer, where perf
hangs in the remote install path like:
NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
task: ffff880273148000 task.stack: ffffc90002d58000
RIP: 0010:[<ffffffff81159232>] [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
RSP: 0018:ffffc90002d5bd60 EFLAGS: 00000202
...
Call Trace:
[<ffffffff81114ce5>] ? trace_hardirqs_on_caller+0xf5/0x1b0
[<ffffffff811e20e0>] ? perf_cgroup_attach+0x70/0x70
[<ffffffff811e1049>] perf_install_in_context+0x199/0x1b0
[<ffffffff811e74e0>] ? ctx_resched+0x90/0x90
[<ffffffff811ed9d1>] SYSC_perf_event_open+0x641/0xf90
[<ffffffff811f1069>] SyS_perf_event_open+0x9/0x10
[<ffffffff81003edc>] do_syscall_64+0x6c/0x1f0
[<ffffffff818defc9>] entry_SYSCALL64_slow_path+0x25/0x25
I found out that I could reproduce this with the following
2 perf commands running simultaneously:
taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10
this forces cpu 10 into an endless loop, causing the soft lockup
AFAICS the reason for this is that intel_pmu_drain_pebs_nhm does
not account event->hw.interrupt for error PEBS interrupts, so in
case you're getting ONLY errors you don't have a way to stop the
event when it's over the max_samples_per_tick limit
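for reference, the throttling logic I mean lives in __perf_event_overflow
and looks roughly like this (a simplified sketch, paraphrased, not the
exact upstream code); the PEBS error path never gets here, so
hwc->interrupts is never bumped and the event is never throttled:

	seq = __this_cpu_read(perf_throttled_seq);
	if (seq != hwc->interrupts_seq) {
		/* first interrupt in this tick, restart the counter */
		hwc->interrupts_seq = seq;
		hwc->interrupts = 1;
	} else {
		hwc->interrupts++;
		if (unlikely(throttle &&
			     hwc->interrupts >= max_samples_per_tick)) {
			/* too many interrupts in one tick, throttle the event */
			__this_cpu_inc(perf_throttled_count);
			hwc->interrupts = MAX_INTERRUPTS;
			perf_log_throttle(event, 0);
			ret = 1;	/* caller should stop the event */
		}
	}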
I added extra accounting for error PEBS and it seems to work now,
the fuzzer has been running for several hours now ;-)
also I could not reproduce with any other event, just branches plus
the additional branch stack sample type
I also fail to reproduce on servers other than snb_x (model 45)
thoughts?
thanks,
jirka
---
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index be202390bbd3..f2010dbe75d6 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;
/* log dropped samples number */
- if (error[bit])
+ if (error[bit]) {
perf_log_lost_samples(event, error[bit]);
+ if (perf_event_account_interrupt(event, 1))
+ x86_pmu_stop(event, 0);
+ }
+
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, base,
top, bit, counts[bit]);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..7225396228ce 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event, int throttle);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 02c8421f8c01..93b46cc2c977 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,25 +7034,11 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+int perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7070,6 +7056,30 @@ static int __perf_event_overflow(struct perf_event *event,
}
}
+ return ret;
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = perf_event_account_interrupt(event, throttle);
+
if (event->attr.freq) {
u64 now = perf_clock();
s64 delta = now - hwc->freq_time_stamp;
On Wed, Dec 14, 2016 at 07:07:30PM +0100, Peter Zijlstra wrote:
> On Wed, Dec 14, 2016 at 05:50:36PM +0100, Jiri Olsa wrote:
> >
> > I also fail to reproduce on servers other than snb_x (model 45)
>
> reproduces on my ivb-ep as well model 62.
>
> > thoughts?
>
> cute find :-)
>
> > +++ b/arch/x86/events/intel/ds.c
> > @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> > continue;
> >
> > /* log dropped samples number */
> > - if (error[bit])
> > + if (error[bit]) {
> > perf_log_lost_samples(event, error[bit]);
> >
> > + if (perf_event_account_interrupt(event, 1))
>
> Seems a bit daft to expose the .throttle argument, since that would be
> the only point of calling this.
there's also the other caller from __perf_event_overflow
> > +static int __perf_event_overflow(struct perf_event *event,
> > + int throttle, struct perf_sample_data *data,
> > + struct pt_regs *regs)
> > +{
> > + int events = atomic_read(&event->event_limit);
> > + struct hw_perf_event *hwc = &event->hw;
> > + int ret = 0;
> > +
> > + /*
> > + * Non-sampling counters might still use the PMI to fold short
> > + * hardware counters, ignore those.
> > + */
> > + if (unlikely(!is_sampling_event(event)))
> > + return 0;
> > +
> > + ret = perf_event_account_interrupt(event, throttle);
> > +
> > if (event->attr.freq) {
> > u64 now = perf_clock();
> > s64 delta = now - hwc->freq_time_stamp;
>
> Arguably, everything in __perf_event_overflow() except for calling of
> ->overflow_handler() should be done I think.
well, I was wondering about that period adjustment bit
but I wasn't sure about those pending_kill/pending_wakeup bits,
they make sense to me only if we have some data to deliver
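for reference, the tail of __perf_event_overflow I mean looks roughly
like this (paraphrased from the current code, might not be verbatim):

	event->pending_kill = POLL_IN;		/* there is data to read */
	if (events && atomic_dec_and_test(&event->event_limit)) {
		ret = 1;
		event->pending_kill = POLL_HUP;	/* limit hit, signal HUP */
		perf_event_disable_inatomic(event);
	}

	READ_ONCE(event->overflow_handler)(event, data, regs);

	if (*perf_event_fasync(event) && event->pending_kill) {
		event->pending_wakeup = 1;	/* wake up poll/fasync waiters */
		irq_work_queue(&event->pending);
	}

without a sample to deliver there's nothing for poll/fasync to wake
up on, which is why I kept this part out of the accounting helper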
jirka
On Wed, Dec 14, 2016 at 05:50:36PM +0100, Jiri Olsa wrote:
>
> I also fail to reproduce on servers other than snb_x (model 45)
reproduces on my ivb-ep as well model 62.
> thoughts?
cute find :-)
> +++ b/arch/x86/events/intel/ds.c
> @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> continue;
>
> /* log dropped samples number */
> - if (error[bit])
> + if (error[bit]) {
> perf_log_lost_samples(event, error[bit]);
>
> + if (perf_event_account_interrupt(event, 1))
Seems a bit daft to expose the .throttle argument, since that would be
the only point of calling this.
> +static int __perf_event_overflow(struct perf_event *event,
> + int throttle, struct perf_sample_data *data,
> + struct pt_regs *regs)
> +{
> + int events = atomic_read(&event->event_limit);
> + struct hw_perf_event *hwc = &event->hw;
> + int ret = 0;
> +
> + /*
> + * Non-sampling counters might still use the PMI to fold short
> + * hardware counters, ignore those.
> + */
> + if (unlikely(!is_sampling_event(event)))
> + return 0;
> +
> + ret = perf_event_account_interrupt(event, throttle);
> +
> if (event->attr.freq) {
> u64 now = perf_clock();
> s64 delta = now - hwc->freq_time_stamp;
Arguably, everything in __perf_event_overflow() except for calling of
->overflow_handler() should be done I think.
On Wed, Dec 14, 2016 at 07:16:36PM +0100, Jiri Olsa wrote:
> > > +++ b/arch/x86/events/intel/ds.c
> > > @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> > > continue;
> > >
> > > /* log dropped samples number */
> > > - if (error[bit])
> > > + if (error[bit]) {
> > > perf_log_lost_samples(event, error[bit]);
> > >
> > > + if (perf_event_account_interrupt(event, 1))
> >
> > Seems a bit daft to expose the .throttle argument, since that would be
> > the only point of calling this.
>
> there's also the other caller from __perf_event_overflow
See the below patchlet ;-)
> > > +static int __perf_event_overflow(struct perf_event *event,
> > > + int throttle, struct perf_sample_data *data,
> > > + struct pt_regs *regs)
> > > +{
> > > + int events = atomic_read(&event->event_limit);
> > > + struct hw_perf_event *hwc = &event->hw;
> > > + int ret = 0;
> > > +
> > > + /*
> > > + * Non-sampling counters might still use the PMI to fold short
> > > + * hardware counters, ignore those.
> > > + */
> > > + if (unlikely(!is_sampling_event(event)))
> > > + return 0;
> > > +
> > > + ret = perf_event_account_interrupt(event, throttle);
> > > +
> > > if (event->attr.freq) {
> > > u64 now = perf_clock();
> > > s64 delta = now - hwc->freq_time_stamp;
> >
> > Arguably, everything in __perf_event_overflow() except for calling of
> > ->overflow_handler() should be done I think.
>
> well, I was wondering about that period adjustment bit
>
> but I wasn't sure about those pending_kill/pending_wakeup bits,
> they make sense to me only if we have some data to deliver
Hmm, maybe. Please add a comment, that way we can at least rediscover we
thought about this.
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1392,7 +1392,7 @@ static void intel_pmu_drain_pebs_nhm(str
if (error[bit]) {
perf_log_lost_samples(event, error[bit]);
- if (perf_event_account_interrupt(event, 1))
+ if (perf_event_account_interrupt(event))
x86_pmu_stop(event, 0);
}
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,7 +1259,7 @@ extern void perf_event_disable(struct pe
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
-extern int perf_event_account_interrupt(struct perf_event *event, int throttle);
+extern int perf_event_account_interrupt(struct perf_event *event);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,7 +7034,7 @@ static void perf_log_itrace_start(struct
perf_output_end(&handle);
}
-int perf_event_account_interrupt(struct perf_event *event, int throttle)
+static int __perf_event_account_interrupt(struct perf_event *event, int throttle)
{
struct hw_perf_event *hwc = &event->hw;
int ret = 0;
@@ -7059,6 +7059,11 @@ int perf_event_account_interrupt(struct
return ret;
}
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -7078,7 +7083,7 @@ static int __perf_event_overflow(struct
if (unlikely(!is_sampling_event(event)))
return 0;
- ret = perf_event_account_interrupt(event, throttle);
+ ret = __perf_event_account_interrupt(event, throttle);
if (event->attr.freq) {
u64 now = perf_clock();
On Wed, Dec 14, 2016 at 08:32:39PM +0100, Peter Zijlstra wrote:
> On Wed, Dec 14, 2016 at 07:16:36PM +0100, Jiri Olsa wrote:
>
> > > > +++ b/arch/x86/events/intel/ds.c
> > > > @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> > > > continue;
> > > >
> > > > /* log dropped samples number */
> > > > - if (error[bit])
> > > > + if (error[bit]) {
> > > > perf_log_lost_samples(event, error[bit]);
> > > >
> > > > + if (perf_event_account_interrupt(event, 1))
> > >
> > > Seems a bit daft to expose the .throttle argument, since that would be
> > > the only point of calling this.
> >
> > there's also the other caller from __perf_event_overflow
>
> See the below patchlet ;-)
ok, np ;-)
>
> > > > +static int __perf_event_overflow(struct perf_event *event,
> > > > + int throttle, struct perf_sample_data *data,
> > > > + struct pt_regs *regs)
> > > > +{
> > > > + int events = atomic_read(&event->event_limit);
> > > > + struct hw_perf_event *hwc = &event->hw;
> > > > + int ret = 0;
> > > > +
> > > > + /*
> > > > + * Non-sampling counters might still use the PMI to fold short
> > > > + * hardware counters, ignore those.
> > > > + */
> > > > + if (unlikely(!is_sampling_event(event)))
> > > > + return 0;
> > > > +
> > > > + ret = perf_event_account_interrupt(event, throttle);
> > > > +
> > > > if (event->attr.freq) {
> > > > u64 now = perf_clock();
> > > > s64 delta = now - hwc->freq_time_stamp;
> > >
> > > Arguably, everything in __perf_event_overflow() except for calling of
> > > ->overflow_handler() should be done I think.
> >
> > well, I was wondering about that period adjustment bit
> >
> > but I wasn't sure about those pending_kill/pending_wakeup bits,
> > they make sense to me only if we have some data to deliver
>
> Hmm, maybe. Please add a comment, that way we can at least rediscover we
> thought about this.
ook
jirka
On Wed, Dec 14, 2016 at 08:32:39PM +0100, Peter Zijlstra wrote:
> On Wed, Dec 14, 2016 at 07:16:36PM +0100, Jiri Olsa wrote:
>
> > > > +++ b/arch/x86/events/intel/ds.c
> > > > @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> > > > continue;
> > > >
> > > > /* log dropped samples number */
> > > > - if (error[bit])
> > > > + if (error[bit]) {
> > > > perf_log_lost_samples(event, error[bit]);
> > > >
> > > > + if (perf_event_account_interrupt(event, 1))
> > >
> > > Seems a bit daft to expose the .throttle argument, since that would be
> > > the only point of calling this.
> >
> > there's also the other caller from __perf_event_overflow
>
> See the below patchlet ;-)
>
> > > > +static int __perf_event_overflow(struct perf_event *event,
> > > > + int throttle, struct perf_sample_data *data,
> > > > + struct pt_regs *regs)
> > > > +{
> > > > + int events = atomic_read(&event->event_limit);
> > > > + struct hw_perf_event *hwc = &event->hw;
> > > > + int ret = 0;
> > > > +
> > > > + /*
> > > > + * Non-sampling counters might still use the PMI to fold short
> > > > + * hardware counters, ignore those.
> > > > + */
> > > > + if (unlikely(!is_sampling_event(event)))
> > > > + return 0;
> > > > +
> > > > + ret = perf_event_account_interrupt(event, throttle);
> > > > +
> > > > if (event->attr.freq) {
> > > > u64 now = perf_clock();
> > > > s64 delta = now - hwc->freq_time_stamp;
> > >
> > > Arguably, everything in __perf_event_overflow() except for calling of
> > > ->overflow_handler() should be done I think.
> >
> > well, I was wondering about that period adjustment bit
> >
> > but I wasn't sure about those pending_kill/pending_wakeup bits,
> > they make sense to me only if we have some data to deliver
>
> Hmm, maybe. Please add a comment, that way we can at least rediscover we
> thought about this.
>
new version with full changelog
jirka
---
It's possible to set up PEBS events and get only errors and
no data at all, like on SNB-X (model 45) and IVB-EP (model 62)
via 2 perf commands running simultaneously:
taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10
This leads to a soft lockup, because the error path of
intel_pmu_drain_pebs_nhm does not account event->hw.interrupt
for error PEBS interrupts, so the event is never stopped when
it gets over the max_samples_per_tick limit.
NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
...
task: ffff880273148000 task.stack: ffffc90002d58000
RIP: 0010:[<ffffffff81159232>] [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
RSP: 0018:ffffc90002d5bd60 EFLAGS: 00000202
...
Call Trace:
? trace_hardirqs_on_caller+0xf5/0x1b0
? perf_cgroup_attach+0x70/0x70
perf_install_in_context+0x199/0x1b0
? ctx_resched+0x90/0x90
SYSC_perf_event_open+0x641/0xf90
SyS_perf_event_open+0x9/0x10
do_syscall_64+0x6c/0x1f0
entry_SYSCALL64_slow_path+0x25/0x25
Adding perf_event_account_interrupt, which does the event's
interrupt and frequency checks, and calling it from drain_pebs's
error path.
Keeping the pending_kill and pending_wakeup logic only in the
__perf_event_overflow path, because they make sense only if
there is any data to deliver.
Signed-off-by: Jiri Olsa <[email protected]>
---
arch/x86/events/intel/ds.c | 6 +++++-
include/linux/perf_event.h | 1 +
kernel/events/core.c | 47 ++++++++++++++++++++++++++++++----------------
3 files changed, 37 insertions(+), 17 deletions(-)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index be202390bbd3..9dfeeeca0ea8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;
/* log dropped samples number */
- if (error[bit])
+ if (error[bit]) {
perf_log_lost_samples(event, error[bit]);
+ if (perf_event_account_interrupt(event))
+ x86_pmu_stop(event, 0);
+ }
+
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, base,
top, bit, counts[bit]);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..78ed8105e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 02c8421f8c01..7c6264f5deb7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,25 +7034,11 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int __perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7066,35 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
--
2.7.4
Hi Jiri,
[auto build test WARNING on tip/perf/core]
[also build test WARNING on v4.9 next-20161215]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Jiri-Olsa/perf-x86-intel-Account-interrupts-for-PEBS-errors/20161216-042644
config: x86_64-randconfig-a0-12160640 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All warnings (new ones prefixed by >>):
kernel/events/core.c: In function '__perf_event_overflow':
>> kernel/events/core.c:7086: warning: unused variable 'hwc'
kernel/events/core.o: warning: objtool: perf_try_init_event()+0x43: function has unreachable instruction
vim +/hwc +7086 kernel/events/core.c
7070 }
7071
7072 int perf_event_account_interrupt(struct perf_event *event)
7073 {
7074 return __perf_event_account_interrupt(event, 1);
7075 }
7076
7077 /*
7078 * Generic event overflow handling, sampling.
7079 */
7080
7081 static int __perf_event_overflow(struct perf_event *event,
7082 int throttle, struct perf_sample_data *data,
7083 struct pt_regs *regs)
7084 {
7085 int events = atomic_read(&event->event_limit);
> 7086 struct hw_perf_event *hwc = &event->hw;
7087 int ret = 0;
7088
7089 /*
7090 * Non-sampling counters might still use the PMI to fold short
7091 * hardware counters, ignore those.
7092 */
7093 if (unlikely(!is_sampling_event(event)))
7094 return 0;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, 15 Dec 2016, Jiri Olsa wrote:
> It's possible to set up PEBS events and get only errors and
> no data at all, like on SNB-X (model 45) and IVB-EP (model 62)
> via 2 perf commands running simultaneously:
>
> taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10
>
> This leads to a soft lockup, because the error path of
> intel_pmu_drain_pebs_nhm does not account event->hw.interrupt
> for error PEBS interrupts, so the event is never stopped when
> it gets over the max_samples_per_tick limit.
>
> NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
> ...
> task: ffff880273148000 task.stack: ffffc90002d58000
> RIP: 0010:[<ffffffff81159232>] [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
> RSP: 0018:ffffc90002d5bd60 EFLAGS: 00000202
> ...
> Call Trace:
> ? trace_hardirqs_on_caller+0xf5/0x1b0
> ? perf_cgroup_attach+0x70/0x70
> perf_install_in_context+0x199/0x1b0
> ? ctx_resched+0x90/0x90
> SYSC_perf_event_open+0x641/0xf90
> SyS_perf_event_open+0x9/0x10
> do_syscall_64+0x6c/0x1f0
> entry_SYSCALL64_slow_path+0x25/0x25
I'll have to try this; with all the recent fixes, NMI lockups like
this are now the major remaining cause of fuzzer issues on my intel
machines. My AMD and ARM machines have been fuzzing for weeks w/o
problems.
I also finally got a power8 machine and it crashes really quickly when
fuzzing, but I haven't had a chance to track things down yet because it
sounds like a jet plane taking off and I can't really leave it fuzzing
like that when students are sitting nearby. Maybe over break.
Vince
On Fri, Dec 16, 2016 at 07:41:04AM +0800, kbuild test robot wrote:
> Hi Jiri,
>
> [auto build test WARNING on tip/perf/core]
> [also build test WARNING on v4.9 next-20161215]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>
> url: https://github.com/0day-ci/linux/commits/Jiri-Olsa/perf-x86-intel-Account-interrupts-for-PEBS-errors/20161216-042644
> config: x86_64-randconfig-a0-12160640 (attached as .config)
> compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64
>
> All warnings (new ones prefixed by >>):
>
> kernel/events/core.c: In function '__perf_event_overflow':
> >> kernel/events/core.c:7086: warning: unused variable 'hwc'
> kernel/events/core.o: warning: objtool: perf_try_init_event()+0x43: function has unreachable instruction
>
> vim +/hwc +7086 kernel/events/core.c
>
> 7070 }
> 7071
> 7072 int perf_event_account_interrupt(struct perf_event *event)
> 7073 {
> 7074 return __perf_event_account_interrupt(event, 1);
> 7075 }
> 7076
> 7077 /*
> 7078 * Generic event overflow handling, sampling.
> 7079 */
> 7080
> 7081 static int __perf_event_overflow(struct perf_event *event,
> 7082 int throttle, struct perf_sample_data *data,
> 7083 struct pt_regs *regs)
> 7084 {
> 7085 int events = atomic_read(&event->event_limit);
> > 7086 struct hw_perf_event *hwc = &event->hw;
> 7087 int ret = 0;
> 7088
ugh, I guess I'm too used to the perf tool failing the build on unused variables :-\
v3 attached
thanks,
jirka
---
It's possible to set up PEBS events and get only errors and
no data at all, like on SNB-X (model 45) and IVB-EP (model 62)
via 2 perf commands running simultaneously:
taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10
This leads to a soft lockup, because the error path of
intel_pmu_drain_pebs_nhm does not account event->hw.interrupt
for error PEBS interrupts, so the event is never stopped when
it gets over the max_samples_per_tick limit.
NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
...
task: ffff880273148000 task.stack: ffffc90002d58000
RIP: 0010:[<ffffffff81159232>] [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
RSP: 0018:ffffc90002d5bd60 EFLAGS: 00000202
...
Call Trace:
? trace_hardirqs_on_caller+0xf5/0x1b0
? perf_cgroup_attach+0x70/0x70
perf_install_in_context+0x199/0x1b0
? ctx_resched+0x90/0x90
SYSC_perf_event_open+0x641/0xf90
SyS_perf_event_open+0x9/0x10
do_syscall_64+0x6c/0x1f0
entry_SYSCALL64_slow_path+0x25/0x25
Adding perf_event_account_interrupt, which does the event's
interrupt and frequency checks, and calling it from drain_pebs's
error path.
Keeping the pending_kill and pending_wakeup logic only in the
__perf_event_overflow path, because they make sense only if
there is any data to deliver.
Signed-off-by: Jiri Olsa <[email protected]>
---
arch/x86/events/intel/ds.c | 6 +++++-
include/linux/perf_event.h | 1 +
kernel/events/core.c | 46 ++++++++++++++++++++++++++++++----------------
3 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index be202390bbd3..9dfeeeca0ea8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;
/* log dropped samples number */
- if (error[bit])
+ if (error[bit]) {
perf_log_lost_samples(event, error[bit]);
+ if (perf_event_account_interrupt(event))
+ x86_pmu_stop(event, 0);
+ }
+
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, base,
top, bit, counts[bit]);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..78ed8105e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 02c8421f8c01..bb24d5abc40a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,25 +7034,11 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int __perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7066,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
--
2.7.4