Date: Tue, 5 May 2015 15:16:37 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: Kan Liang <kan.liang@intel.com>
Cc: mingo@kernel.org, acme@infradead.org, eranian@google.com,
        andi@firstfloor.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH V7 3/6] perf, x86: handle multiple records in PEBS buffer
Message-ID: <20150505131637.GO23123@twins.programming.kicks-ass.net>
References: <1429517270-8079-1-git-send-email-kan.liang@intel.com>
 <1429517270-8079-4-git-send-email-kan.liang@intel.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <1429517270-8079-4-git-send-email-kan.liang@intel.com>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 4963
Lines: 169

On Mon, Apr 20, 2015 at 04:07:47AM -0400, Kan Liang wrote:
> +static inline void *
> +get_next_pebs_record_by_bit(void *base, void *top, int bit)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	void *at;
> +	u64 pebs_status;
> +
> +	if (base == NULL)
> +		return NULL;
> +
> +	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> +		struct pebs_record_nhm *p = at;
> +
> +		if (test_bit(bit, (unsigned long *)&p->status)) {

Just wondering, is that BT better than: p->state & (1 << bit) ?

> +
> +			if (p->status == (1 << bit))
> +				return at;
> +
> +			/* clear non-PEBS bit and re-check */
> +			pebs_status = p->status & cpuc->pebs_enabled;
> +			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> +			if (pebs_status == (1 << bit))
> +				return at;
> +		}
> +	}
> +	return NULL;
> +}
> +
>  static void __intel_pmu_pebs_event(struct perf_event *event,
> +				   struct pt_regs *iregs,
> +				   void *base, void *top,
> +				   int bit, int count)
>  {
>  	struct perf_sample_data data;
>  	struct pt_regs regs;
> +	int i;
> +	void *at = get_next_pebs_record_by_bit(base, top, bit);
>  
> +	if (!intel_pmu_save_and_restart(event) &&
> +	    !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
>  		return;
>  
> +	if (count > 1) {
> +		for (i = 0; i < count - 1; i++) {
> +			setup_pebs_sample_data(event, iregs, at, &data, &regs);
> +			perf_event_output(event, &data, &regs);
> +			at += x86_pmu.pebs_record_size;
> +			at = get_next_pebs_record_by_bit(at, top, bit);
> +		}
> +	}
> +
> +	setup_pebs_sample_data(event, iregs, at, &data, &regs);
>  
> +	/* all records are processed, handle event overflow now */

All but the last. There explicitly is one left to be able to call the
overflow handler is there not?

> +	if (perf_event_overflow(event, &data, &regs)) {
>  		x86_pmu_stop(event, 0);
> +		return;
> +	}
> +
>  }
>  
>  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> @@ -1000,72 +1081,86 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
>  	if (!event->attr.precise_ip)
>  		return;
>  
> +	n = (top - at) / x86_pmu.pebs_record_size;
>  	if (n <= 0)
>  		return;
>  
> +	__intel_pmu_pebs_event(event, iregs, at,
> +			       top, 0, n);
>  }
>  
>  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
>  {
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  	struct debug_store *ds = cpuc->ds;
> +	struct perf_event *event;
> +	void *base, *at, *top;
>  	int bit;
> +	int counts[MAX_PEBS_EVENTS] = {};
>  
>  	if (!x86_pmu.pebs_active)
>  		return;
>  
> +	base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
>  	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
>  
>  	ds->pebs_index = ds->pebs_buffer_base;
>  
> +	if (unlikely(base >= top))
>  		return;
>  
> +	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
>  		struct pebs_record_nhm *p = at;
>  
>  		for_each_set_bit(bit, (unsigned long *)&p->status,
>  				 x86_pmu.max_pebs_events) {
>  			event = cpuc->events[bit];
>  			WARN_ON_ONCE(!event);
>  
> +			if (event->attr.precise_ip)
> +				break;
> +		}

Would it make sense to delay looking for the event until you've found
there is a single bit set -- and already know which bit that is?

>  
> +		if (bit >= x86_pmu.max_pebs_events)
> +			continue;
> +		if (!test_bit(bit, cpuc->active_mask))
> +			continue;
> +		/*
> +		 * The PEBS hardware does not deal well with the situation
> +		 * when events happen near to each other and multiple bits
> +		 * are set. But it should happen rarely.
> +		 *
> +		 * If these events include one PEBS and multiple non-PEBS
> +		 * events, it doesn't impact PEBS record. The record will
> +		 * be handled normally. (slow path)
> +		 *
> +		 * If these events include two or more PEBS events, the
> +		 * records for the events can be collapsed into a single
> +		 * one, and it's not possible to reconstruct all events
> +		 * that caused the PEBS record. It's called collision.
> +		 * If collision happened, the record will be dropped.
> +		 *
> +		 */
> +		if (p->status != (1 << bit)) {
> +			u64 pebs_status;
> +
> +			/* slow path */
> +			pebs_status = p->status & cpuc->pebs_enabled;
> +			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
> +			if (pebs_status != (1 << bit)) {
> +				perf_log_lost(event);

Does it make sense to keep an error[bit] count and only log once with
the actual number in? -- when !0 obviously.

>  				continue;
> +			}
>  		}
> +		counts[bit]++;
> +	}
>  
> +	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
> +		if (counts[bit] == 0)
>  			continue;
> +		event = cpuc->events[bit];
> +		__intel_pmu_pebs_event(event, iregs, base,
> +				       top, bit, counts[bit]);
>  	}
>  }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/