2021-05-12 18:48:32

by kajoljain

[permalink] [raw]
Subject: [RFC 1/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats

Patch adds performance stats reporting support for nvdimm.
Added interface includes support for a pmu register function and
callbacks to be used by the arch/platform specific drivers.
User could use the standard perf tool to access perf events exposed
via pmu.

A structure is added called nvdimm_pmu which can be used to add
platform specific data like supported events and callbacks to pmu
functions like event_init/add/delete/read. It also adds
unregister_nvdimm_pmu function to handle unregistering of a pmu device.

Signed-off-by: Kajol Jain <[email protected]>
---
drivers/nvdimm/Makefile | 1 +
drivers/nvdimm/nd_perf.c | 111 +++++++++++++++++++++++++++++++++++++++
include/linux/nd.h | 31 +++++++++++
3 files changed, 143 insertions(+)
create mode 100644 drivers/nvdimm/nd_perf.c

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 29203f3d3069..25dba6095612 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -18,6 +18,7 @@ nd_e820-y := e820.o
libnvdimm-y := core.o
libnvdimm-y += bus.o
libnvdimm-y += dimm_devs.o
+libnvdimm-y += nd_perf.o
libnvdimm-y += dimm.o
libnvdimm-y += region_devs.o
libnvdimm-y += region.o
diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c
new file mode 100644
index 000000000000..d28bec2b61a2
--- /dev/null
+++ b/drivers/nvdimm/nd_perf.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nd_perf.c: NVDIMM Device Performance Monitoring Unit support
+ *
+ * Perf interface to expose nvdimm performance stats.
+ *
+ * Copyright (C) 2021 IBM Corporation
+ */
+
+#define pr_fmt(fmt) "nvdimm_pmu: " fmt
+
+#include <linux/nd.h>
+
+#define to_nvdimm_pmu(_pmu) container_of(_pmu, struct nvdimm_pmu, pmu)
+
+static int nvdimm_pmu_event_init(struct perf_event *event)
+{
+ struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+ /* test the event attr type for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* it does not support event sampling mode */
+ if (is_sampling_event(event))
+ return -EINVAL;
+
+ /* no branch sampling */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ /* jump to arch/platform specific callbacks if any */
+ if (nd_pmu && nd_pmu->event_init)
+ return nd_pmu->event_init(event, nd_pmu->dev);
+
+ return 0;
+}
+
+static void nvdimm_pmu_read(struct perf_event *event)
+{
+ struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+ /* jump to arch/platform specific callbacks if any */
+ if (nd_pmu && nd_pmu->read)
+ nd_pmu->read(event, nd_pmu->dev);
+}
+
+static void nvdimm_pmu_del(struct perf_event *event, int flags)
+{
+ struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+ /* jump to arch/platform specific callbacks if any */
+ if (nd_pmu && nd_pmu->del)
+ nd_pmu->del(event, flags, nd_pmu->dev);
+}
+
+static int nvdimm_pmu_add(struct perf_event *event, int flags)
+{
+ struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+ if (flags & PERF_EF_START)
+ /* jump to arch/platform specific callbacks if any */
+ if (nd_pmu && nd_pmu->add)
+ return nd_pmu->add(event, flags, nd_pmu->dev);
+ return 0;
+}
+
+int register_nvdimm_pmu(struct nvdimm_pmu *nd_pmu, struct platform_device *pdev)
+{
+ int rc;
+
+ if (!nd_pmu || !pdev)
+ return -EINVAL;
+
+ nd_pmu->pmu.task_ctx_nr = perf_invalid_context;
+ nd_pmu->pmu.event_init = nvdimm_pmu_event_init;
+ nd_pmu->pmu.add = nvdimm_pmu_add;
+ nd_pmu->pmu.del = nvdimm_pmu_del;
+ nd_pmu->pmu.read = nvdimm_pmu_read;
+ nd_pmu->pmu.name = nd_pmu->name;
+ nd_pmu->pmu.attr_groups = nd_pmu->attr_groups;
+ nd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_INTERRUPT |
+ PERF_PMU_CAP_NO_EXCLUDE;
+
+ /*
+ * Adding platform_device->dev pointer to nvdimm_pmu, so that we can
+ * access that device data in PMU callbacks and also pass it to
+ * arch/platform specific code.
+ */
+ nd_pmu->dev = &pdev->dev;
+
+ rc = perf_pmu_register(&nd_pmu->pmu, nd_pmu->name, -1);
+ if (rc)
+ return rc;
+
+ pr_info("%s NVDIMM performance monitor support registered\n",
+ nd_pmu->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_nvdimm_pmu);
+
+void unregister_nvdimm_pmu(struct pmu *nd_pmu)
+{
+ /*
+ * nd_pmu will get free in arch/platform specific code once
+ * corresponding pmu get unregistered.
+ */
+ perf_pmu_unregister(nd_pmu);
+}
+EXPORT_SYMBOL_GPL(unregister_nvdimm_pmu);
diff --git a/include/linux/nd.h b/include/linux/nd.h
index ee9ad76afbba..fa6e60b2b368 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -8,6 +8,8 @@
#include <linux/ndctl.h>
#include <linux/device.h>
#include <linux/badblocks.h>
+#include <linux/platform_device.h>
+#include <linux/perf_event.h>

enum nvdimm_event {
NVDIMM_REVALIDATE_POISON,
@@ -23,6 +25,35 @@ enum nvdimm_claim_class {
NVDIMM_CCLASS_UNKNOWN,
};

+/**
+ * struct nvdimm_pmu - data structure for nvdimm perf driver
+ *
+ * @name: name of the nvdimm pmu device.
+ * @pmu: pmu data structure for nvdimm performance stats.
+ * @cpu: designated cpu for counter access.
+ * @dev: nvdimm device pointer.
+ * @functions(event_init/add/del/read): platform specific callbacks.
+ * @attr_groups: data structure for events/formats/cpumask.
+ * @node: node for cpu hotplug notifier link.
+ * @cpuhp_state: state for cpu hotplug notification.
+ */
+struct nvdimm_pmu {
+ const char *name;
+ struct pmu pmu;
+ int cpu;
+ struct device *dev;
+ int (*event_init)(struct perf_event *event, struct device *dev);
+ int (*add)(struct perf_event *event, int flags, struct device *dev);
+ void (*del)(struct perf_event *event, int flags, struct device *dev);
+ void (*read)(struct perf_event *event, struct device *dev);
+ const struct attribute_group **attr_groups;
+ struct hlist_node node;
+ enum cpuhp_state cpuhp_state;
+};
+
+int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm, struct platform_device *pdev);
+void unregister_nvdimm_pmu(struct pmu *pmu);
+
struct nd_device_driver {
struct device_driver drv;
unsigned long type;
--
2.27.0


2021-05-12 19:49:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats

On Wed, May 12, 2021 at 10:08:21PM +0530, Kajol Jain wrote:
> +static void nvdimm_pmu_read(struct perf_event *event)
> +{
> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> +
> + /* jump to arch/platform specific callbacks if any */
> + if (nd_pmu && nd_pmu->read)
> + nd_pmu->read(event, nd_pmu->dev);
> +}
> +
> +static void nvdimm_pmu_del(struct perf_event *event, int flags)
> +{
> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> +
> + /* jump to arch/platform specific callbacks if any */
> + if (nd_pmu && nd_pmu->del)
> + nd_pmu->del(event, flags, nd_pmu->dev);
> +}
> +
> +static int nvdimm_pmu_add(struct perf_event *event, int flags)
> +{
> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> +
> + if (flags & PERF_EF_START)
> + /* jump to arch/platform specific callbacks if any */
> + if (nd_pmu && nd_pmu->add)
> + return nd_pmu->add(event, flags, nd_pmu->dev);
> + return 0;
> +}

What's the value add here? Why can't you directly set driver pointers? I
also don't really believe ->{add,del,read} can be optional and still
have a sane driver.

2021-05-13 20:24:42

by kajoljain

[permalink] [raw]
Subject: Re: [RFC 1/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats



On 5/12/21 10:57 PM, Peter Zijlstra wrote:
> On Wed, May 12, 2021 at 10:08:21PM +0530, Kajol Jain wrote:
>> +static void nvdimm_pmu_read(struct perf_event *event)
>> +{
>> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
>> +
>> + /* jump to arch/platform specific callbacks if any */
>> + if (nd_pmu && nd_pmu->read)
>> + nd_pmu->read(event, nd_pmu->dev);
>> +}
>> +
>> +static void nvdimm_pmu_del(struct perf_event *event, int flags)
>> +{
>> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
>> +
>> + /* jump to arch/platform specific callbacks if any */
>> + if (nd_pmu && nd_pmu->del)
>> + nd_pmu->del(event, flags, nd_pmu->dev);
>> +}
>> +
>> +static int nvdimm_pmu_add(struct perf_event *event, int flags)
>> +{
>> + struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
>> +
>> + if (flags & PERF_EF_START)
>> + /* jump to arch/platform specific callbacks if any */
>> + if (nd_pmu && nd_pmu->add)
>> + return nd_pmu->add(event, flags, nd_pmu->dev);
>> + return 0;
>> +}
>
> What's the value add here? Why can't you directly set driver pointers? I
> also don't really believe ->{add,del,read} can be optional and still
> have a sane driver.
>

Hi Peter,

The intend for adding these callbacks is to give flexibility to the
arch/platform specific driver code to use its own routine for getting
counter data or specific checks/operations. Arch/platform driver code
would have different method to get the counter data like IBM pseries
nmem* device which uses a hypervisor call(hcall).

But yes the current read/add/del functions are not adding value. We
could add an arch/platform specific function which could handle the
capturing of the counter data and do the rest of the operation here,
is this approach better?

Thanks,
Kajol Jain




2021-05-14 17:50:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC 1/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats

On Thu, May 13, 2021 at 05:56:14PM +0530, kajoljain wrote:

> But yes the current read/add/del functions are not adding value. We
> could add an arch/platform specific function which could handle the
> capturing of the counter data and do the rest of the operation here,
> is this approach better?

Right; have your register_nvdimm_pmu() set pmu->{add,del,read} to
nd_pmu->{add,del,read} directly, don't bother with these intermediates.
Also you can WARN_ON_ONCE() if any of them are NULL and fail
registration at that point.

2021-05-17 06:47:46

by kajoljain

[permalink] [raw]
Subject: Re: [RFC 1/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats



On 5/14/21 5:17 PM, Peter Zijlstra wrote:
> On Thu, May 13, 2021 at 05:56:14PM +0530, kajoljain wrote:
>
>> But yes the current read/add/del functions are not adding value. We
>> could add an arch/platform specific function which could handle the
>> capturing of the counter data and do the rest of the operation here,
>> is this approach better?
>
> Right; have your register_nvdimm_pmu() set pmu->{add,del,read} to
> nd_pmu->{add,del,read} directly, don't bother with these intermediates.
> Also you can WARN_ON_ONCE() if any of them are NULL and fail
> registration at that point.
>

Hi Peter,
I will make all required changes and send next version of this patchset soon.

Thanks,
Kajol Jain