2015-07-01 00:44:25

by Izumi, Taku

[permalink] [raw]
Subject: [PATCH] perf: Fix multi-segment problem of perf_event_intel_uncore

In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

....
0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03)
...
0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03)
...
0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03)
...
0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03
...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes ths problem by introducing segment-aware pci2phy_map instead.

Signed-off-by: Taku Izumi <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_uncore.c | 27 +++++++++++---
arch/x86/kernel/cpu/perf_event_intel_uncore.h | 9 ++++-
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 23 +++++++++++-
.../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 41 ++++++++++++++++++----
4 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..78c8686 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
static bool pcidrv_registered;
struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];

static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -806,10 +807,18 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
struct intel_uncore_type *type;
- int phys_id;
+ int phys_id = -1;
bool first_box = false;
+ struct pci2phy_map *map;

- phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(pdev->bus)) {
+ phys_id = map->pbus_to_physid[pdev->bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
if (phys_id < 0)
return -ENODEV;

@@ -856,8 +865,18 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ int i, cpu, phys_id = -1;
bool last_box = false;
+ struct pci2phy_map *map;
+
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(pdev->bus)) {
+ phys_id = map->pbus_to_physid[pdev->bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);

box = pci_get_drvdata(pdev);
if (!box) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..0fb2a23 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,12 @@ struct uncore_event_desc {
const char *config;
};

+struct pci2phy_map {
+ struct list_head list;
+ int segment;
+ int pbus_to_physid[256];
+};
+
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);

@@ -317,7 +323,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
extern struct event_constraint uncore_constraint_empty;

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -402,14 +402,35 @@ static int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
int bus;
+ int segment;
+ struct pci2phy_map *map;
+ bool found = false;

dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
if (!dev)
return -ENOTTY;

bus = dev->bus->number;
+ segment = pci_domain_nr(dev->bus);

- uncore_pcibus_to_physid[bus] = 0;
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ map = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ if (map) {
+ map->segment = segment;
+ map->pbus_to_physid[bus] = 0;
+ list_add_tail(&map->list, &pci2phy_map_head);
+ }
+ } else {
+ map->pbus_to_physid[bus] = 0;
+ }
+ raw_spin_unlock(&pci2phy_map_lock);

pci_dev_put(dev);

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 6d6e85d..85ae74b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1090,6 +1090,9 @@ static int snbep_pci2phy_map_init(int devid)
int i, bus, nodeid;
int err = 0;
u32 config = 0;
+ struct pci2phy_map *map;
+ int segment;
+ bool found = false;

while (1) {
/* find the UBOX device */
@@ -1106,13 +1109,33 @@ static int snbep_pci2phy_map_init(int devid)
err = pci_read_config_dword(ubox_dev, 0x54, &config);
if (err)
break;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ map = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ if (!map) {
+ err = -ENOMEM;
+ break;
+ }
+ map->segment = segment;
+ list_add_tail(&map->list, &pci2phy_map_head);
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
/*
* every three bits in the Node ID mapping register maps
* to a particular node.
*/
for (i = 0; i < 8; i++) {
if (nodeid == ((config >> (3 * i)) & 0x7)) {
- uncore_pcibus_to_physid[bus] = i;
+ map->pbus_to_physid[bus] = i;
break;
}
}
@@ -1123,13 +1146,17 @@ static int snbep_pci2phy_map_init(int devid)
* For PCI bus with no UBOX device, find the next bus
* that has UBOX device and use its mapping.
*/
- i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (uncore_pcibus_to_physid[bus] >= 0)
- i = uncore_pcibus_to_physid[bus];
- else
- uncore_pcibus_to_physid[bus] = i;
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
+ raw_spin_unlock(&pci2phy_map_lock);
}

pci_dev_put(ubox_dev);
--
1.8.3.1


2015-07-02 11:17:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf: Fix multi-segment problem of perf_event_intel_uncore

On Thu, Jul 02, 2015 at 05:01:40AM +0900, Taku Izumi wrote:
> + raw_spin_lock(&pci2phy_map_lock);
> + list_for_each_entry(map, &pci2phy_map_head, list) {
> + if (map->segment == pci_domain_nr(pdev->bus)) {
> + phys_id = map->pbus_to_physid[pdev->bus->number];
> + break;
> + }
> + }
> + raw_spin_unlock(&pci2phy_map_lock);

> + raw_spin_lock(&pci2phy_map_lock);
> + list_for_each_entry(map, &pci2phy_map_head, list) {
> + if (map->segment == pci_domain_nr(pdev->bus)) {
> + phys_id = map->pbus_to_physid[pdev->bus->number];
> + break;
> + }
> + }
> + raw_spin_unlock(&pci2phy_map_lock);

> + raw_spin_lock(&pci2phy_map_lock);
> + list_for_each_entry(map, &pci2phy_map_head, list) {
> + if (map->segment == segment) {
> + found = true;
> + break;
> + }
> + }
> + if (!found) {
> + map = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
> + if (map) {
> + map->segment = segment;
> + map->pbus_to_physid[bus] = 0;
> + list_add_tail(&map->list, &pci2phy_map_head);
> + }
> + } else {
> + map->pbus_to_physid[bus] = 0;
> + }
> + raw_spin_unlock(&pci2phy_map_lock);

> + raw_spin_lock(&pci2phy_map_lock);
> + list_for_each_entry(map, &pci2phy_map_head, list) {
> + if (map->segment == segment) {
> + found = true;
> + break;
> + }
> + }
> + if (!found) {
> + map = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
> + if (!map) {
> + err = -ENOMEM;
> + break;
> + }
> + map->segment = segment;
> + list_add_tail(&map->list, &pci2phy_map_head);
> + }
> + raw_spin_unlock(&pci2phy_map_lock);

You'd think they invent something to avoid repetitions like that.. Oh
wait..