On Fri, May 26, 2023 at 3:00 AM Jonathan Cameron
<[email protected]> wrote:
>
> CXL rev 3.0 introduces a standard performance monitoring hardware
> block to CXL. Instances are discovered using CXL Register Locator DVSEC
> entries. Each CXL component may have multiple PMUs.
>
> This initial driver supports a subset of types of counter.
> It supports counters that are either fixed or configurable, but requires
> that they support the ability to freeze and write value whilst frozen.
>
> Development done with QEMU model which will be posted shortly.
>
> Example:
>
> $ perf stat -a -e cxl_pmu_mem0.0/h2d_req_snpcur/ -e cxl_pmu_mem0.0/h2d_req_snpdata/ -e cxl_pmu_mem0.0/clock_ticks/ sleep 1
>
> Performance counter stats for 'system wide':
>
> 96,757,023,244,321 cxl_pmu_mem0.0/h2d_req_snpcur/
> 96,757,023,244,365 cxl_pmu_mem0.0/h2d_req_snpdata/
> 193,514,046,488,653 cxl_pmu_mem0.0/clock_ticks/
>
> 1.090539600 seconds time elapsed
>
> Reviewed-by: Dave Jiang <[email protected]>
> Reviewed-by: Kan Liang <[email protected]>
> Signed-off-by: Jonathan Cameron <[email protected]>
> ---
> v7: Thanks to Namhyung and Stephane for reviews
> - Set the cpu for an event only after validation checks.
> - Added -a to make perf default of all CPUs explicit in command line.
> - Fix wrong event names in example.
> ---
[SNIP]
> +static void cxl_pmu_event_start(struct perf_event *event, int flags)
> +{
> + struct cxl_pmu_info *info = pmu_to_cxl_pmu_info(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + void __iomem *base = info->base;
> + u64 cfg;
> +
> + /*
> + * All paths to here should either set these flags directly or
> + * call cxl_pmu_event_stop() which will ensure the correct state.
> + */
> + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
> + return;
> +
> + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
> + hwc->state = 0;
> +
> + /*
> + * Currently only hdm filter control is implemnted, this code will
Typo: implemented
> + * want generalizing when more filters are added.
> + */
> + if (info->filter_hdm) {
> + if (cxl_pmu_config1_hdm_filter_en(event))
> + cfg = cxl_pmu_config2_get_hdm_decoder(event);
> + else
> + cfg = GENMASK(15, 0); /* No filtering if 0xFFFF_FFFF */
> + writeq(cfg, base + CXL_PMU_FILTER_CFG_REG(hwc->idx, 0));
> + }
> +
> + cfg = readq(base + CXL_PMU_COUNTER_CFG_REG(hwc->idx));
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_INT_ON_OVRFLW, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_FREEZE_ON_OVRFLW, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_ENABLE, 1);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EDGE,
> + cxl_pmu_config1_get_edge(event) ? 1 : 0);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_INVERT,
> + cxl_pmu_config1_get_invert(event) ? 1 : 0);
> +
> + /* Fixed purpose counters have next two fields RO */
> + if (test_bit(hwc->idx, info->conf_counter_bm)) {
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EVENT_GRP_ID_IDX_MSK,
> + hwc->event_base);
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_EVENTS_MSK,
> + cxl_pmu_config_get_mask(event));
> + }
> + cfg &= ~CXL_PMU_COUNTER_CFG_THRESHOLD_MSK;
> + /*
> + * For events that generate only 1 count per clock the CXL 3.0 spec
> + * states the threshold shall be set to 1 but if set to 0 it will
> + * count the raw value anwyay?
Typo: anyway?
> + * There is no definition of what events will count multiple per cycle
> + * and hence to which non 1 values of threshold can apply.
> + * (CXL 3.0 8.2.7.2.1 Counter Configuration - threshold field definition)
> + */
> + cfg |= FIELD_PREP(CXL_PMU_COUNTER_CFG_THRESHOLD_MSK,
> + cxl_pmu_config1_get_threshold(event));
> + writeq(cfg, base + CXL_PMU_COUNTER_CFG_REG(hwc->idx));
> +
> + local64_set(&hwc->prev_count, 0);
> + writeq(0, base + CXL_PMU_COUNTER_REG(hwc->idx));
> +
> + perf_event_update_userpage(event);
> +}
> +
[SNIP]
> +
> +static int cxl_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> +
> + if (info->on_cpu != -1)
> + return 0;
> +
> + info->on_cpu = cpu;
> + /*
> + * CPU HP lock is held so we should be guaranteed that the CPU hasn't yet
> + * gone away again.
> + */
> + WARN_ON(irq_set_affinity(info->irq, cpumask_of(cpu)));
> +
> + return 0;
> +}
> +
> +static int cxl_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> + unsigned int target;
> +
> + if (info->on_cpu != cpu)
> + return 0;
> +
> + info->on_cpu = -1;
> + target = cpumask_any_but(cpu_online_mask, cpu);
> + if (target >= nr_cpu_ids) {
> + dev_err(info->pmu.dev, "Unable to find a suitable CPU\n");
> + return 0;
> + }
> +
> + perf_pmu_migrate_context(&info->pmu, cpu, target);
> + info->on_cpu = target;
> + /*
> + * CPU HP lock is held so we should be guaranteed that this CPU hasn't yet
> + * gone away.
> + */
> + WARN_ON(irq_set_affinity(info->irq, cpumask_of(target)));
> +
> + return 0;
> +}
IIUC a CXL PMU hardware (say cxl_pmu_mem0.0) is shared across
all CPUs and it would return the same value when read from any CPU,
right?
Thanks,
Namhyung
> +
> +static __init int cxl_pmu_init(void)
> +{
> + int rc;
> +
> + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> + "AP_PERF_CXL_PMU_ONLINE",
> + cxl_pmu_online_cpu, cxl_pmu_offline_cpu);
> + if (rc < 0)
> + return rc;
> + cxl_pmu_cpuhp_state_num = rc;
> +
> + rc = cxl_driver_register(&cxl_pmu_driver);
> + if (rc)
> + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> +
> + return rc;
> +}
> +
> +static __exit void cxl_pmu_exit(void)
> +{
> + cxl_driver_unregister(&cxl_pmu_driver);
> + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> +}
> +
> +MODULE_LICENSE("GPL");
> +MODULE_IMPORT_NS(CXL);
> +module_init(cxl_pmu_init);
> +module_exit(cxl_pmu_exit);
> +MODULE_ALIAS_CXL(CXL_DEVICE_PMU);
> --
> 2.39.2
>
Hi,
Tidied up the typos. Thanks,
> > +static int cxl_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> > +{
> > + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> > + unsigned int target;
> > +
> > + if (info->on_cpu != cpu)
> > + return 0;
> > +
> > + info->on_cpu = -1;
> > + target = cpumask_any_but(cpu_online_mask, cpu);
> > + if (target >= nr_cpu_ids) {
> > + dev_err(info->pmu.dev, "Unable to find a suitable CPU\n");
> > + return 0;
> > + }
> > +
> > + perf_pmu_migrate_context(&info->pmu, cpu, target);
> > + info->on_cpu = target;
> > + /*
> > + * CPU HP lock is held so we should be guaranteed that this CPU hasn't yet
> > + * gone away.
> > + */
> > + WARN_ON(irq_set_affinity(info->irq, cpumask_of(target)));
> > +
> > + return 0;
> > +}
>
> IIUC a CXL PMU hardware (say cxl_pmu_mem0.0) is shared across
> all CPUs and it would return the same value when read from any CPU,
> right?
Correct, it will return the same value when used from any CPU.
I'm not sure what issue you are indicating.
My understanding is that, even for such cases, perf uses percpu
variables that mean we still have to ensure that the interrupt
handling occurs on the CPU we have migrated the context to.
There are a lot of similar drivers in perf already from a quick
git grep cpumask_any_but\(cpu_online_mask,
It might be nice to enable perf to operate for these devices without
the percpu context though. I haven't looked into whether that
is worth doing.
Jonathan
>
> Thanks,
> Namhyung
>
>
> > +
> > +static __init int cxl_pmu_init(void)
> > +{
> > + int rc;
> > +
> > + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> > + "AP_PERF_CXL_PMU_ONLINE",
> > + cxl_pmu_online_cpu, cxl_pmu_offline_cpu);
> > + if (rc < 0)
> > + return rc;
> > + cxl_pmu_cpuhp_state_num = rc;
> > +
> > + rc = cxl_driver_register(&cxl_pmu_driver);
> > + if (rc)
> > + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> > +
> > + return rc;
> > +}
> > +
> > +static __exit void cxl_pmu_exit(void)
> > +{
> > + cxl_driver_unregister(&cxl_pmu_driver);
> > + cpuhp_remove_multi_state(cxl_pmu_cpuhp_state_num);
> > +}
> > +
> > +MODULE_LICENSE("GPL");
> > +MODULE_IMPORT_NS(CXL);
> > +module_init(cxl_pmu_init);
> > +module_exit(cxl_pmu_exit);
> > +MODULE_ALIAS_CXL(CXL_DEVICE_PMU);
> > --
> > 2.39.2
> >
>
Jonathan Cameron wrote:
>
> Hi,
>
> Tidied up the typos. Thanks,
Care to send that replacement patch? It looks like, in linux-next, no
other tree has picked up:
http://lore.kernel.org/r/[email protected]
...so I'll queue that as well.
On Tue, May 30, 2023 at 6:50 AM Jonathan Cameron
<[email protected]> wrote:
>
>
> Hi,
>
> Tidied up the typos. Thanks,
>
> > > +static int cxl_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> > > +{
> > > + struct cxl_pmu_info *info = hlist_entry_safe(node, struct cxl_pmu_info, node);
> > > + unsigned int target;
> > > +
> > > + if (info->on_cpu != cpu)
> > > + return 0;
> > > +
> > > + info->on_cpu = -1;
> > > + target = cpumask_any_but(cpu_online_mask, cpu);
> > > + if (target >= nr_cpu_ids) {
> > > + dev_err(info->pmu.dev, "Unable to find a suitable CPU\n");
> > > + return 0;
> > > + }
> > > +
> > > + perf_pmu_migrate_context(&info->pmu, cpu, target);
> > > + info->on_cpu = target;
> > > + /*
> > > + * CPU HP lock is held so we should be guaranteed that this CPU hasn't yet
> > > + * gone away.
> > > + */
> > > + WARN_ON(irq_set_affinity(info->irq, cpumask_of(target)));
> > > +
> > > + return 0;
> > > +}
> >
> > IIUC a CXL PMU hardware (say cxl_pmu_mem0.0) is shared across
> > all CPUs and it would return the same value when read from any CPU,
> > right?
>
> Correct, it will return the same value when used from any CPU.
> I'm not sure what issue you are indicating.
>
> My understanding is that, even for such cases, perf uses percpu
> variables that mean we still have to ensure that the interrupt
> handling occurs on the CPU we have migrated the context to.
>
> There are a lot of similar drivers in perf already from a quick
> git grep cpumask_any_but\(cpu_online_mask,
>
> It might be nice to enable perf to operate for these devices without
> the percpu context though. I haven't looked into whether that
> is worth doing.
I was wondering whether we could avoid IPIs for reads from other
CPUs. It has PERF_EV_CAP_READ_ACTIVE_PKG for some uncore events
which can be shared among CPUs in the same package
to skip the IPI and to read it from the local CPU.
But I think this can be a separate step.
Thanks,
Namhyung