From: "Liang, Kan" <kan.liang@intel.com>
To: Peter Zijlstra <peterz@infradead.org>
CC: "mingo@kernel.org" <mingo@kernel.org>,
        "acme@infradead.org" <acme@infradead.org>,
        "eranian@google.com" <eranian@google.com>,
        "andi@firstfloor.org" <andi@firstfloor.org>,
        "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Subject: RE: [PATCH V7 3/6] perf, x86: handle multiple records in PEBS buffer
Thread-Topic: [PATCH V7 3/6] perf, x86: handle multiple records in PEBS
 buffer
Thread-Index: AQHQe310Bv0vIV2rhEqFiZATlJXnO51s7leAgACfL3A=
Date: Tue, 5 May 2015 16:30:25 +0000
Message-ID: <37D7C6CF3E00A74B8858931C1DB2F0770180BB27@SHSMSX103.ccr.corp.intel.com>
References: <1429517270-8079-1-git-send-email-kan.liang@intel.com>
 <1429517270-8079-4-git-send-email-kan.liang@intel.com>
 <20150505131637.GO23123@twins.programming.kicks-ass.net>
In-Reply-To: <20150505131637.GO23123@twins.programming.kicks-ass.net>
Accept-Language: zh-CN, en-US
Content-Language: en-US
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 8BIT
MIME-Version: 1.0
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7045
Lines: 228


> 
> On Mon, Apr 20, 2015 at 04:07:47AM -0400, Kan Liang wrote:
> > +static inline void *
> > +get_next_pebs_record_by_bit(void *base, void *top, int bit) {
> > +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > +	void *at;
> > +	u64 pebs_status;
> > +
> > +	if (base == NULL)
> > +		return NULL;
> > +
> > +	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> > +		struct pebs_record_nhm *p = at;
> > +
> > +		if (test_bit(bit, (unsigned long *)&p->status)) {
> 
> Just wondering, is that BT better than: p->state & (1 << bit) ?

Technically, I think they are same here.
test_bit looks more common, and widely used in drain_pebs functions.
So I changed it according to Andi's comment.

> 
> > +
> > +			if (p->status == (1 << bit))
> > +				return at;
> > +
> > +			/* clear non-PEBS bit and re-check */
> > +			pebs_status = p->status & cpuc->pebs_enabled;
> > +			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> > +			if (pebs_status == (1 << bit))
> > +				return at;
> > +		}
> > +	}
> > +	return NULL;
> > +}
> > +
> >  static void __intel_pmu_pebs_event(struct perf_event *event,
> > +				   struct pt_regs *iregs,
> > +				   void *base, void *top,
> > +				   int bit, int count)
> >  {
> >  	struct perf_sample_data data;
> >  	struct pt_regs regs;
> > +	int i;
> > +	void *at = get_next_pebs_record_by_bit(base, top, bit);
> >
> > +	if (!intel_pmu_save_and_restart(event) &&
> > +	    !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
> >  		return;
> >
> > +	if (count > 1) {
> > +		for (i = 0; i < count - 1; i++) {
> > +			setup_pebs_sample_data(event, iregs, at, &data,
> &regs);
> > +			perf_event_output(event, &data, &regs);
> > +			at += x86_pmu.pebs_record_size;
> > +			at = get_next_pebs_record_by_bit(at, top, bit);
> > +		}
> > +	}
> > +
> > +	setup_pebs_sample_data(event, iregs, at, &data, &regs);
> >
> > +	/* all records are processed, handle event overflow now */
> 
> All but the last. There explicitly is one left to be able to call the overflow
> handler is there not?

Right, I will change the comments.

> 
> > +	if (perf_event_overflow(event, &data, &regs)) {
> >  		x86_pmu_stop(event, 0);
> > +		return;
> > +	}
> > +
> >  }
> >
> >  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) @@
> > -1000,72 +1081,86 @@ static void intel_pmu_drain_pebs_core(struct
> pt_regs *iregs)
> >  	if (!event->attr.precise_ip)
> >  		return;
> >
> > +	n = (top - at) / x86_pmu.pebs_record_size;
> >  	if (n <= 0)
> >  		return;
> >
> > +	__intel_pmu_pebs_event(event, iregs, at,
> > +			       top, 0, n);
> >  }
> >
> >  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  {
> >  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >  	struct debug_store *ds = cpuc->ds;
> > +	struct perf_event *event;
> > +	void *base, *at, *top;
> >  	int bit;
> > +	int counts[MAX_PEBS_EVENTS] = {};
> >
> >  	if (!x86_pmu.pebs_active)
> >  		return;
> >
> > +	base = (struct pebs_record_nhm *)(unsigned
> > +long)ds->pebs_buffer_base;
> >  	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
> >
> >  	ds->pebs_index = ds->pebs_buffer_base;
> >
> > +	if (unlikely(base >= top))
> >  		return;
> >
> > +	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> >  		struct pebs_record_nhm *p = at;
> >
> >  		for_each_set_bit(bit, (unsigned long *)&p->status,
> >  				 x86_pmu.max_pebs_events) {
> >  			event = cpuc->events[bit];
> >  			WARN_ON_ONCE(!event);
> >
> > +			if (event->attr.precise_ip)
> > +				break;
> > +		}
> 
> Would it make sense to delay looking for the event until you've found
> there is a single bit set -- and already know which bit that is?
>

Yes, I think we can test cpuc->pebs_enabled here.
It should be better than attr.precise_ip checking.

-       for (; at < top; at += x86_pmu.pebs_record_size) {
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
                struct pebs_record_nhm *p = at;

                for_each_set_bit(bit, (unsigned long *)&p->status,
                                 x86_pmu.max_pebs_events) {
-                       event = cpuc->events[bit];
-                       if (!test_bit(bit, cpuc->active_mask))
-                               continue;
-
-                       WARN_ON_ONCE(!event);

-                       if (!event->attr.precise_ip)
-                               continue;
+                       if (test_bit(bit, cpuc->pebs_enabled))
+                               break;
+               }

... ...
 
+       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+               if (counts[bit] == 0)
                        continue;
-
-               __intel_pmu_pebs_event(event, iregs, at);
+               event = cpuc->events[bit];
+               WARN_ON_ONCE(!event);
+               WARN_ON_ONCE(!event->attr.precise_ip);
+               __intel_pmu_pebs_event(event, iregs, base,
+                                      top, bit, counts[bit]);
        }

> >
> > +		if (bit >= x86_pmu.max_pebs_events)
> > +			continue;
> > +		if (!test_bit(bit, cpuc->active_mask))
> > +			continue;
> > +		/*
> > +		 * The PEBS hardware does not deal well with the situation
> > +		 * when events happen near to each other and multiple
> bits
> > +		 * are set. But it should happen rarely.
> > +		 *
> > +		 * If these events include one PEBS and multiple non-PEBS
> > +		 * events, it doesn't impact PEBS record. The record will
> > +		 * be handled normally. (slow path)
> > +		 *
> > +		 * If these events include two or more PEBS events, the
> > +		 * records for the events can be collapsed into a single
> > +		 * one, and it's not possible to reconstruct all events
> > +		 * that caused the PEBS record. It's called collision.
> > +		 * If collision happened, the record will be dropped.
> > +		 *
> > +		 */
> > +		if (p->status != (1 << bit)) {
> > +			u64 pebs_status;
> > +
> > +			/* slow path */
> > +			pebs_status = p->status & cpuc->pebs_enabled;
> > +			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> > +			if (pebs_status != (1 << bit)) {
> > +				perf_log_lost(event);
> 
> Does it make sense to keep an error[bit] count and only log once with the
> actual number in? -- when !0 obviously.

Yes, will do it.

Thanks,
Kan
> 
> >  				continue;
> > +			}
> >  		}
> > +		counts[bit]++;
> > +	}
> >
> > +	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
> > +		if (counts[bit] == 0)
> >  			continue;
> > +		event = cpuc->events[bit];
> > +		__intel_pmu_pebs_event(event, iregs, base,
> > +				       top, bit, counts[bit]);
> >  	}
> >  }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org More majordomo
> info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/