2021-01-08 15:38:36

by Steve Wahl

[permalink] [raw]
Subject: [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes

For Intel uncore, the registers being used to identify the die don't
contain enough bits to uniquely identify more than 8 dies. On
systems with more than 8 dies, this results in error messages of the
form "skx_uncore: probe of XXXX:XX:XX.X failed with error -22", and
some perf counters showing up as "<not supported>".

On such systems, use NUMA information to determine die id.

Continue to use the register information with 8 or fewer numa nodes to
cover cases like NUMA not being enabled.

The first patch moves translation from physical to logical die id
earlier in the code, and stores only the logical id. The logical id
is the only one that is really used. Without this change the second
patch would have to store both physical and logical id, which was much
more complicated.

The second patch adds the alternative of deriving the logical die id
from the NUMA information when there are more than 8 nodes.

Steve Wahl (2):
perf/x86/intel/uncore: Store the logical die id instead of the
physical die id.
perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA
info

arch/x86/events/intel/uncore.c | 58 +++++---------
arch/x86/events/intel/uncore.h | 5 +-
arch/x86/events/intel/uncore_snb.c | 2 +-
arch/x86/events/intel/uncore_snbep.c | 114 ++++++++++++++++++---------
4 files changed, 99 insertions(+), 80 deletions(-)

--
2.26.2


2021-01-08 15:38:46

by Steve Wahl

[permalink] [raw]
Subject: [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id.

The phys_id isn't really used other than to map to a logical die id.
Calculate the logical die id earlier, and store that instead of the
phys_id.

Signed-off-by: Steve Wahl <[email protected]>
---
arch/x86/events/intel/uncore.c | 58 ++++++++++------------------
arch/x86/events/intel/uncore.h | 5 +--
arch/x86/events/intel/uncore_snb.c | 2 +-
arch/x86/events/intel/uncore_snbep.c | 31 +++++++--------
4 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f82dc8..33c8180d5a87 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =

MODULE_LICENSE("GPL");

-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;

raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);

- return phys_id;
+ return die_id;
}

static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ struct pci2phy_map *__find_pci2phy_map(int segment)
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);

end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,

uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;

/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;

@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);

atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;

- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;

@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}

- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);

pci_set_drvdata(pdev, pmu->boxes[die]);

@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];

- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;

- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;

box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)

pci_set_drvdata(pdev, NULL);

- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}

static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;

/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;

- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;

- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);

return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;

while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;

- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;

if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea154349d..a3c6e1643ad2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};

struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};

struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);

ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893e2e22..51271288499e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);

pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215d..2d7014dc46f6 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1395,7 +1395,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
*/
for (i = 0; i < 8; i++) {
if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
break;
}
}
@@ -1412,17 +1416,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4650,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;

while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;
--
2.26.2

2021-01-08 15:39:01

by Steve Wahl

[permalink] [raw]
Subject: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

The registers used to determine which die a pci bus belongs to don't
contain enough information to uniquely specify more than 8 dies, so
when more than 8 dies are present, use NUMA information instead.

Continue to use the previous method for 8 or fewer because it
works there, and covers cases of NUMA being disabled.

Signed-off-by: Steve Wahl <[email protected]>
---
arch/x86/events/intel/uncore_snbep.c | 93 +++++++++++++++++++---------
1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2d7014dc46f6..b79951d0707c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1370,40 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;

- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }

- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- if (topology_max_die_per_package() > 1)
- die_id = i;
- else
- die_id = topology_phys_to_logical_pkg(i);
- map->pbus_to_dieid[bus] = die_id;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}

if (!err) {
--
2.26.2

2021-01-11 15:49:30

by Steve Wahl

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
>
>
> > + /*
> > + * The nodeid and idmap registers only contain enough
> > + * information to handle 8 nodes. On systems with more
> > + * than 8 nodes, we need to rely on NUMA information,
> > + * filled in from BIOS supplied information, to determine
> > + * the topology.
> > + */
>
> Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> bonghits qualitee :/

I work too close to BIOS people (virtually, at least for the moment)
to safely make disparaging remarks. :-) While the origin is the BIOS,
I'm using pieces that were already being pulled from the BIOS tables
for NUMA purposes.

> > + if (nr_node_ids <= 8) {
> > + /* get the Node ID of the local register */
> > + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
> > + if (err)
> > + break;
> > + nodeid = config & NODE_ID_MASK;
> > + /* get the Node ID mapping */
> > + err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
> > + if (err)
> > + break;
> >
> > + segment = pci_domain_nr(ubox_dev->bus);
> > + raw_spin_lock(&pci2phy_map_lock);
> > + map = __find_pci2phy_map(segment);
> > + if (!map) {
> > + raw_spin_unlock(&pci2phy_map_lock);
> > + err = -ENOMEM;
> > + break;
> > + }
> > +
> > + /*
> > + * every three bits in the Node ID mapping register maps
> > + * to a particular node.
> > + */
> > + for (i = 0; i < 8; i++) {
> > + if (nodeid == ((config >> (3 * i)) & 0x7)) {
> > + if (topology_max_die_per_package() > 1)
> > + die_id = i;
> > + else
> > + die_id = topology_phys_to_logical_pkg(i);
> > + map->pbus_to_dieid[bus] = die_id;
> > + break;
> > + }
> > + }
> > raw_spin_unlock(&pci2phy_map_lock);
> > + } else {
> > + int node = pcibus_to_node(ubox_dev->bus);
> > + int cpu;
> > +
> > + segment = pci_domain_nr(ubox_dev->bus);
> > + raw_spin_lock(&pci2phy_map_lock);
> > + map = __find_pci2phy_map(segment);
> > + if (!map) {
> > + raw_spin_unlock(&pci2phy_map_lock);
> > + err = -ENOMEM;
> > + break;
> > + }
> > + die_id = -1;
> > + for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
> > + struct cpuinfo_x86 *c = &cpu_data(cpu);
> > +
> > + if (c->initialized && cpu_to_node(cpu) == node) {
> > + map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
> > + break;
> > + }
> > + }
> > + raw_spin_unlock(&pci2phy_map_lock);
> > +
> > + if (WARN_ON_ONCE(die_id == -1)) {
> > + err = -EINVAL;
> > break;
> > }
>
> This seems to assume a single die per node; is that fundemantally
> correct?

It should work for one or more nodes per die; i.e. sub-NUMA clustering
should work. If there are any systems with fewer nodes than dies
(more than one die in a NUMA node) it will likely fail. It's not
clear to me whether nodes < dies is a possibility or not; however,
note that this situation would be broken with or without my changes.

> Did you consider malicious BIOS data? I think we're good, but I didn't
> look too hard.

I did not consider malicious BIOS data. With quick thought toward it,
I believe the worst that could happen is the counters get associated
with the wrong die, and only under circumstances where the previous
code would have aborted mapping the counters to dies completely (which
it does when there are more than 8 dies).

Thank you for taking the time to look at this!

--> Steve

--
Steve Wahl, Hewlett Packard Enterprise

2021-01-12 00:45:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:


> + /*
> + * The nodeid and idmap registers only contain enough
> + * information to handle 8 nodes. On systems with more
> + * than 8 nodes, we need to rely on NUMA information,
> + * filled in from BIOS supplied information, to determine
> + * the topology.
> + */

Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
bonghits qualitee :/

> + if (nr_node_ids <= 8) {
> + /* get the Node ID of the local register */
> + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
> + if (err)
> + break;
> + nodeid = config & NODE_ID_MASK;
> + /* get the Node ID mapping */
> + err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
> + if (err)
> + break;
>
> + segment = pci_domain_nr(ubox_dev->bus);
> + raw_spin_lock(&pci2phy_map_lock);
> + map = __find_pci2phy_map(segment);
> + if (!map) {
> + raw_spin_unlock(&pci2phy_map_lock);
> + err = -ENOMEM;
> + break;
> + }
> +
> + /*
> + * every three bits in the Node ID mapping register maps
> + * to a particular node.
> + */
> + for (i = 0; i < 8; i++) {
> + if (nodeid == ((config >> (3 * i)) & 0x7)) {
> + if (topology_max_die_per_package() > 1)
> + die_id = i;
> + else
> + die_id = topology_phys_to_logical_pkg(i);
> + map->pbus_to_dieid[bus] = die_id;
> + break;
> + }
> + }
> raw_spin_unlock(&pci2phy_map_lock);
> + } else {
> + int node = pcibus_to_node(ubox_dev->bus);
> + int cpu;
> +
> + segment = pci_domain_nr(ubox_dev->bus);
> + raw_spin_lock(&pci2phy_map_lock);
> + map = __find_pci2phy_map(segment);
> + if (!map) {
> + raw_spin_unlock(&pci2phy_map_lock);
> + err = -ENOMEM;
> + break;
> + }
> + die_id = -1;
> + for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
> + struct cpuinfo_x86 *c = &cpu_data(cpu);
> +
> + if (c->initialized && cpu_to_node(cpu) == node) {
> + map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
> + break;
> + }
> + }
> + raw_spin_unlock(&pci2phy_map_lock);
> +
> + if (WARN_ON_ONCE(die_id == -1)) {
> + err = -EINVAL;
> break;
> }

This seems to assume a single die per node; is that fundemantally
correct?

Did you consider malicious BIOS data? I think we're good, but I didn't
look too hard.

> }
> }
>
> if (!err) {
> --
> 2.26.2
>

2021-01-12 08:52:24

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes



On 1/8/2021 10:35 AM, Steve Wahl wrote:
> For Intel uncore, the registers being used to identify the die don't
> contain enough bits to uniquely identify more than 8 dies. On
> systems with more than 8 dies, this results in error messages of the
> form "skx_uncore: probe of XXXX:XX:XX.X failed with error -22", and
> some perf counters showing up as "<not supported>".
>
> On such systems, use NUMA information to determine die id.
>
> Continue to use the register information with 8 or fewer numa nodes to
> cover cases like NUMA not being enabled.
>
> The first patch moves translation from physical to logical die id
> earlier in the code, and stores only the logical id. The logical id
> is the only one that is really used. Without this change the second
> patch would have to store both physical and logical id, which was much
> more complicated.
>
> The second patch adds the alternative of deriving the logical die id
> from the NUMA information when there are more than 8 nodes.
>
> Steve Wahl (2):
> perf/x86/intel/uncore: Store the logical die id instead of the
> physical die id.
> perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA
> info
>
> arch/x86/events/intel/uncore.c | 58 +++++---------
> arch/x86/events/intel/uncore.h | 5 +-
> arch/x86/events/intel/uncore_snb.c | 2 +-
> arch/x86/events/intel/uncore_snbep.c | 114 ++++++++++++++++++---------
> 4 files changed, 99 insertions(+), 80 deletions(-)
>

Thanks Steve for working on the issue. The patch set looks good to me.

Reviewed-by: Kan Liang <[email protected]>

Thanks,
Kan

2021-01-12 15:10:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

On Mon, Jan 11, 2021 at 09:45:16AM -0600, Steve Wahl wrote:
> On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
> >
> >
> > > + /*
> > > + * The nodeid and idmap registers only contain enough
> > > + * information to handle 8 nodes. On systems with more
> > > + * than 8 nodes, we need to rely on NUMA information,
> > > + * filled in from BIOS supplied information, to determine
> > > + * the topology.
> > > + */
> >
> > Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> > bonghits qualitee :/
>
> I work too close to BIOS people (virtually, at least for the moment)
> to safely make disparaging remarks. :-) While the origin is the BIOS,
> I'm using pieces that were already being pulled from the BIOS tables
> for NUMA purposes.

:-) It's just that we've had too much 'fun' with PCI node bindings in
the past.



2021-01-12 19:47:04

by Steve Wahl

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

On Tue, Jan 12, 2021 at 04:07:15PM +0100, Peter Zijlstra wrote:
> On Mon, Jan 11, 2021 at 09:45:16AM -0600, Steve Wahl wrote:
> > On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
> > >
> > >
> > > > + /*
> > > > + * The nodeid and idmap registers only contain enough
> > > > + * information to handle 8 nodes. On systems with more
> > > > + * than 8 nodes, we need to rely on NUMA information,
> > > > + * filled in from BIOS supplied information, to determine
> > > > + * the topology.
> > > > + */
> > >
> > > Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> > > bonghits qualitee :/
> >
> > I work too close to BIOS people (virtually, at least for the moment)
> > to safely make disparaging remarks. :-) While the origin is the BIOS,
> > I'm using pieces that were already being pulled from the BIOS tables
> > for NUMA purposes.
>
> :-) It's just that we've had too much 'fun' with PCI node bindings in
> the past.

I wasn't aware of that, but I understand. Fortunately, this patch
should't touch cases that aren't already broken (>8 nodes); working
cases continue to use the existing methods.

Thanks!

--> Steve Wahl

--
Steve Wahl, Hewlett Packard Enterprise

Subject: [tip: perf/core] perf/x86/intel/uncore: Store the logical die id instead of the physical die id.

The following commit has been merged into the perf/core branch of tip:

Commit-ID: ba9506be4e402ee597b8f41204008b97989b5eef
Gitweb: https://git.kernel.org/tip/ba9506be4e402ee597b8f41204008b97989b5eef
Author: Steve Wahl <[email protected]>
AuthorDate: Fri, 08 Jan 2021 09:35:48 -06:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 14 Jan 2021 11:20:13 +01:00

perf/x86/intel/uncore: Store the logical die id instead of the physical die id.

The phys_id isn't really used other than to map to a logical die id.
Calculate the logical die id earlier, and store that instead of the
phys_id.

Signed-off-by: Steve Wahl <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Kan Liang <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/events/intel/uncore.c | 58 +++++++++------------------
arch/x86/events/intel/uncore.h | 5 +--
arch/x86/events/intel/uncore_snb.c | 2 +-
arch/x86/events/intel/uncore_snbep.c | 31 ++++++--------
4 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f..33c8180 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =

MODULE_LICENSE("GPL");

-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;

raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);

- return phys_id;
+ return die_id;
}

static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);

end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,

uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;

/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;

@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);

atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;

- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;

@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}

- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);

pci_set_drvdata(pdev, pmu->boxes[die]);

@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];

- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;

- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;

box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)

pci_set_drvdata(pdev, NULL);

- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}

static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;

/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;

- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;

- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);

return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;

while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;

- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;

if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea15..a3c6e16 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};

struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};

struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);

ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893..5127128 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);

pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb182..2d7014d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1395,7 +1395,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
*/
for (i = 0; i < 8; i++) {
if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
break;
}
}
@@ -1412,17 +1416,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4650,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;

while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;

Subject: [tip: perf/core] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

The following commit has been merged into the perf/core branch of tip:

Commit-ID: 9a7832ce3d920426a36cdd78eda4b3568d4d09e3
Gitweb: https://git.kernel.org/tip/9a7832ce3d920426a36cdd78eda4b3568d4d09e3
Author: Steve Wahl <[email protected]>
AuthorDate: Fri, 08 Jan 2021 09:35:49 -06:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 14 Jan 2021 11:20:14 +01:00

perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

The registers used to determine which die a pci bus belongs to don't
contain enough information to uniquely specify more than 8 dies, so
when more than 8 dies are present, use NUMA information instead.

Continue to use the previous method for 8 or fewer because it
works there, and covers cases of NUMA being disabled.

Signed-off-by: Steve Wahl <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Kan Liang <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/events/intel/uncore_snbep.c | 93 ++++++++++++++++++---------
1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2d7014d..b79951d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1370,40 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;

- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }

- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- if (topology_max_die_per_package() > 1)
- die_id = i;
- else
- die_id = topology_phys_to_logical_pkg(i);
- map->pbus_to_dieid[bus] = die_id;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}

if (!err) {