MIME-Version: 1.0
In-Reply-To: <1450227266-2501-6-git-send-email-andi@firstfloor.org>
References: <1450227266-2501-1-git-send-email-andi@firstfloor.org>
	<1450227266-2501-6-git-send-email-andi@firstfloor.org>
Date: Wed, 16 Dec 2015 06:32:50 -0800
Message-ID: <CABPqkBQ-e8DZvZ6y_QPsf2ggqJ4CcaM_EaFwMPyy80ASY=Hsgw@mail.gmail.com>
Subject: Re: [PATCH 05/10] perf, tools, stat: Basic support for TopDown in
 perf stat
From: Stephane Eranian <eranian@google.com>
To: Andi Kleen <andi@firstfloor.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>,
        Peter Zijlstra <peterz@infradead.org>, Jiri Olsa <jolsa@kernel.org>,
        Ingo Molnar <mingo@kernel.org>, LKML <linux-kernel@vger.kernel.org>,
        Namhyung Kim <namhyung@kernel.org>, Andi Kleen <ak@linux.intel.com>
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 8649
Lines: 236

On Tue, Dec 15, 2015 at 4:54 PM, Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Add basic plumbing for TopDown in perf stat
>
> Add a new --topdown options to enable events.
> When --topdown is specified set up events for all topdown
> events supported by the kernel.
> Add topdown-* as a special case to the event parser, as is
> needed for all events containing -.
>
> The actual code to compute the metrics is in follow-on patches.
>
> v2:
> Use standard sysctl read function.
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  tools/perf/Documentation/perf-stat.txt |   8 +++
>  tools/perf/builtin-stat.c              | 100 ++++++++++++++++++++++++++++++++-
>  tools/perf/util/parse-events.l         |   1 +
>  3 files changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
> index 58296e7..9be9ac8 100644
> --- a/tools/perf/Documentation/perf-stat.txt
> +++ b/tools/perf/Documentation/perf-stat.txt
> @@ -163,6 +163,14 @@ filter out the startup phase of the program, which is often very different.
>
>  Print statistics of transactional execution if supported.
>
> +--topdown::
> +
> +Print top down level 1 metrics if supported by the CPU. This allows to
> +determine bottle necks in the CPU pipeline for CPU bound workloads,
> +by breaking it down into frontend bound, backend bound, bad speculation
> +and retiring.  Specifying the option multiple times shows metrics even
> +if the don't cross a threshold.
> +
>  EXAMPLES
>  --------
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 1faa6fc..e10198c 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -60,6 +60,7 @@
>  #include "util/thread_map.h"
>  #include "util/counts.h"
>
> +#include <api/fs/fs.h>
>  #include <stdlib.h>
>  #include <sys/prctl.h>
>  #include <locale.h>
> @@ -95,6 +96,15 @@ static const char * transaction_limited_attrs = {
>         "}"
>  };
>
> +static const char * topdown_attrs[] = {
> +       "topdown-total-slots",
> +       "topdown-fetch-bubbles",
> +       "topdown-slots-retired",
> +       "topdown-recovery-bubbles",
> +       "topdown-slots-issued",
> +       NULL,
> +};
> +
>  static struct perf_evlist      *evsel_list;
>
>  static struct target target = {
> @@ -109,6 +119,7 @@ static volatile pid_t               child_pid                       = -1;
>  static bool                    null_run                        =  false;
>  static int                     detailed_run                    =  0;
>  static bool                    transaction_run;
> +static int                     topdown_run                     = 0;
>  static bool                    big_num                         =  true;
>  static int                     big_num_opt                     =  -1;
>  static const char              *csv_sep                        = NULL;
> @@ -1283,6 +1294,8 @@ static const struct option stat_options[] = {
>                      "ms to wait before starting measurement after program start"),
>         OPT_BOOLEAN(0, "metric-only", &metric_only,
>                         "Only print computed metrics. No raw values"),
> +       OPT_INCR(0, "topdown", &topdown_run,
> +                   "measure topdown level 1 statistics"),
>         OPT_END()
>  };
>
> @@ -1380,12 +1393,68 @@ static void perf_stat__exit_aggr_mode(void)
>         cpus_aggr_map = NULL;
>  }
>
> +static void filter_events(const char **attr, char **str, bool use_group)
> +{
> +       int off = 0;
> +       int i;
> +       int len = 0;
> +       char *s;
> +
> +       for (i = 0; attr[i]; i++) {
> +               if (pmu_have_event("cpu", attr[i])) {
> +                       len += strlen(attr[i]) + 1;
> +                       attr[i - off] = attr[i];
> +               } else
> +                       off++;
> +       }
> +       attr[i - off] = NULL;
> +
> +       *str = malloc(len + 1 + 2);
> +       if (!*str)
> +               return;
> +       s = *str;
> +       if (i - off == 0) {
> +               *s = 0;
> +               return;
> +       }
> +       if (use_group)
> +               *s++ = '{';
> +       for (i = 0; attr[i]; i++) {
> +               strcpy(s, attr[i]);
> +               s += strlen(s);
> +               *s++ = ',';
> +       }
> +       if (use_group) {
> +               s[-1] = '}';
> +               *s = 0;
> +       } else
> +               s[-1] = 0;
> +}
> +
> +/*
> + * Check whether we can use a group for top down.
> + * Without a group may get bad results due to multiplexing.
> + */

That is not because you have a counter used by the NMI that
you cannot group. If HT is off you have plenty of counters to
do this.

> +static bool check_group(bool *warn)
> +{
> +       int n;
> +
> +       if (sysctl__read_int("kernel/nmi_watchdog", &n) < 0)
> +               return false;
> +       if (n > 0) {
> +               *warn = true;
> +               return false;
> +       }
> +       return true;
> +}
> +

I do not like this part very much. You are pulling in x86 specific
knowledge into
builtin_stat.c. Why not move this into an x86 specific file?

>  /*
>   * Add default attributes, if there were no attributes specified or
>   * if -d/--detailed, -d -d or -d -d -d is used:
>   */
>  static int add_default_attributes(void)
>  {
> +       int err;
>         struct perf_event_attr default_attrs[] = {
>
>    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
> @@ -1498,7 +1567,6 @@ static int add_default_attributes(void)
>                 return 0;
>
>         if (transaction_run) {
> -               int err;
>                 if (pmu_have_event("cpu", "cycles-ct") &&
>                     pmu_have_event("cpu", "el-start"))
>                         err = parse_events(evsel_list, transaction_attrs, NULL);
> @@ -1511,6 +1579,36 @@ static int add_default_attributes(void)
>                 return 0;
>         }
>
> +       if (topdown_run) {
> +               char *str = NULL;
> +               bool warn = false;
> +
> +               filter_events(topdown_attrs, &str, check_group(&warn));
> +               if (topdown_attrs[0] && str) {
> +                       if (warn)
> +                               fprintf(stderr,
> +               "nmi_watchdog enabled with topdown. May give wrong results.\n"
> +               "Disable with echo 0 > /proc/sys/kernel/nmi_watchdog\n");

This is x86 specific. Why not just try it out and in case of error
suggest checking
if pinned system-wide events exist (such as NMI watchdog on x86). that would
be more generic.

> +                       err = parse_events(evsel_list, str, NULL);
> +                       if (err) {
> +                               fprintf(stderr,
> +                                       "Cannot set up top down events %s: %d\n",
> +                                       str, err);
> +                               free(str);
> +                               return -1;
> +                       }
> +               } else {
> +                       fprintf(stderr, "System does not support topdown\n");
> +                       return -1;
> +               }
> +               free(str);
> +               /*
> +                * Right now combining with the other attributes breaks group
> +                * semantics.
> +                */
> +               return 0;
> +       }
> +
>         if (!evsel_list->nr_entries) {
>                 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
>                         return -1;
> diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> index 58c5831..3e65d61 100644
> --- a/tools/perf/util/parse-events.l
> +++ b/tools/perf/util/parse-events.l
> @@ -248,6 +248,7 @@ cycles-ct                                   { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  cycles-t                                       { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  mem-loads                                      { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  mem-stores                                     { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
> +topdown-[a-z-]+                                        { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>
>  L1-dcache|l1-d|l1d|L1-data             |
>  L1-icache|l1-i|l1i|L1-instruction      |
> --
> 2.4.3
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/