Since NVDIMMs are installed on memory slots, they expose the NUMA
topology of a platform. This patchset adds support of sysfs
'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices.
This enables numactl(8) to accept 'block:' and 'file:' paths of pmem
and btt devices as shown in the examples below.
numactl --preferred block:pmem0 --show
numactl --preferred file:/dev/pmem0s --show
numactl can be used to bind an application to the locality of a target
NVDIMM for better performance. A result of fio benchmark to ext4/dax
on an HP DL380 with 2 sockets for local & remote settings is shown
below.
Local [1] : 4098.3MB/s
Remote [2]: 3718.4MB/s
[1] numactl --preferred block:pmem0 --cpunodebind block:pmem0 fio <fs-on-pmem0>
[2] numactl --preferred block:pmem1 --cpunodebind block:pmem1 fio <fs-on-pmem0>
Patch 1/3 applies on top of the acpica branch of the pm tree.
Patch 2/3-3/3 apply on top of Dan Williams's patch series
"libnvdimm: ->rw_bytes(), BLK-mode, unit tests, and misc features".
---
v3:
- Update the comment of acpi_map_pxm_to_online_node(). (Rafael Wysocki)
v2:
- Add acpi_map_pxm_to_online_node(), which returns an online node.
- Manage visibility of sysfs numa_node with is_visible. (Dan Williams)
- Check ACPI_NFIT_PROXIMITY_VALID in spa->flags.
---
Toshi Kani (3):
1/3 acpi: Add acpi_map_pxm_to_online_node()
2/3 libnvdimm: Set numa_node to NVDIMM devices
3/3 libnvdimm: Add sysfs numa_node to NVDIMM devices
---
drivers/acpi/nfit.c | 7 ++++++
drivers/acpi/numa.c | 50 ++++++++++++++++++++++++++++++++++++++---
drivers/nvdimm/btt.c | 2 ++
drivers/nvdimm/btt_devs.c | 1 +
drivers/nvdimm/bus.c | 30 +++++++++++++++++++++++++
drivers/nvdimm/namespace_devs.c | 1 +
drivers/nvdimm/nd.h | 1 +
drivers/nvdimm/region.c | 1 +
drivers/nvdimm/region_devs.c | 1 +
include/linux/acpi.h | 5 +++++
include/linux/libnvdimm.h | 2 ++
11 files changed, 98 insertions(+), 3 deletions(-)
The kernel initializes CPU & memory's NUMA topology from ACPI
SRAT table. Some other ACPI tables, such as NFIT and DMAR, also
contain proximity IDs for their device's NUMA topology. This
information can be used to improve performance of these devices.
This patch introduces acpi_map_pxm_to_online_node(), which is
similar to acpi_map_pxm_to_node(), but always returns an on-line
node. When the mapped node from a given proximity ID is off-line,
it looks up the node distance table and returns the nearest
on-line node.
ACPI device drivers, which are called after the NUMA initialization
has completed in the kernel, can call this interface to obtain their
device NUMA topology from ACPI tables. Such drivers do not have to
deal with off-line nodes. A node may be off-line when a device
proximity ID is unique, SRAT memory entry does not exist, or NUMA is
disabled, ex. "numa=off" on x86.
This patch also moves the pxm range check from acpi_get_node() to
acpi_map_pxm_to_node().
Signed-off-by: Toshi Kani <[email protected]>
---
drivers/acpi/numa.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
include/linux/acpi.h | 5 +++++
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 1333cbdc..0b6e3a0 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -29,6 +29,8 @@
#include <linux/errno.h>
#include <linux/acpi.h>
#include <linux/numa.h>
+#include <linux/nodemask.h>
+#include <linux/topology.h>
#define PREFIX "ACPI: "
@@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node)
int acpi_map_pxm_to_node(int pxm)
{
- int node = pxm_to_node_map[pxm];
+ int node;
+
+ if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
+ return NUMA_NO_NODE;
+
+ node = pxm_to_node_map[pxm];
if (node == NUMA_NO_NODE) {
if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
@@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm)
return node;
}
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to on-line node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an on-line
+ * node. When the mapped node from a given proximity ID is off-line, it
+ * looks up the node distance table and returns the nearest on-line node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables. Such drivers do not have to deal with
+ * off-line nodes. A node may be off-line when a device proximity ID is
+ * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
+ * "numa=off" on x86.
+ */
+int acpi_map_pxm_to_online_node(int pxm)
+{
+ int node, n, dist, min_dist;
+
+ node = acpi_map_pxm_to_node(pxm);
+
+ if (node == NUMA_NO_NODE)
+ node = 0;
+
+ if (!node_online(node)) {
+ min_dist = INT_MAX;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ node = n;
+ }
+ }
+ }
+
+ return node;
+}
+EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
+
static void __init
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
{
@@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle)
int pxm;
pxm = acpi_get_pxm(handle);
- if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
- return NUMA_NO_NODE;
return acpi_map_pxm_to_node(pxm);
}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e3..1b3bbb1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d);
extern void acpi_osi_setup(char *str);
#ifdef CONFIG_ACPI_NUMA
+int acpi_map_pxm_to_online_node(int pxm);
int acpi_get_node(acpi_handle handle);
#else
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+ return 0;
+}
static inline int acpi_get_node(acpi_handle handle)
{
return 0;
ACPI NFIT table has System Physical Address Range Structure
entries that describe a proximity ID of each range when
ACPI_NFIT_PROXIMITY_VALID is set in the flags.
Change acpi_nfit_register_region() to map a proximity ID to its
node ID, and set it to a new numa_node field of nd_region_desc,
which is then conveyed to nd_region.
nd_region_probe() and nd_btt_probe() set the numa_node of nd_region
to their device object being probed. A namespace device inherits
the numa_node from the parent region device.
Signed-off-by: Toshi Kani <[email protected]>
---
drivers/acpi/nfit.c | 6 ++++++
drivers/nvdimm/btt.c | 2 ++
drivers/nvdimm/nd.h | 1 +
drivers/nvdimm/region.c | 1 +
drivers/nvdimm/region_devs.c | 1 +
include/linux/libnvdimm.h | 1 +
6 files changed, 12 insertions(+)
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 5f64582..5997753 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
ndr_desc->res = &res;
ndr_desc->provider_data = nfit_spa;
ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
+ if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
+ ndr_desc->numa_node = acpi_map_pxm_to_online_node(
+ spa->proximity_domain);
+ else
+ ndr_desc->numa_node = NUMA_NO_NODE;
+
list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
struct nd_mapping *nd_mapping;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 57d3b27..ab082e5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1495,6 +1495,8 @@ static int nd_btt_probe(struct device *dev)
rc = -ENOMEM;
goto err_btt;
}
+
+ set_dev_node(dev, nd_region->numa_node);
dev_set_drvdata(dev, btt);
return 0;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 011d7c5..0bfd20a 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -93,6 +93,7 @@ struct nd_region {
u64 ndr_size;
u64 ndr_start;
int id, num_lanes;
+ int numa_node;
void *provider_data;
struct nd_interleave_set *nd_set;
struct nd_mapping mapping[0];
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index d9d82e7..a764ca6 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -123,6 +123,7 @@ static int nd_region_probe(struct device *dev)
num_ns->active = rc;
num_ns->count = rc + err;
+ set_dev_node(dev, nd_region->numa_node);
dev_set_drvdata(dev, num_ns);
if (err == 0)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index bb9f329..703dfae 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -632,6 +632,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
nd_region->provider_data = ndr_desc->provider_data;
nd_region->nd_set = ndr_desc->nd_set;
nd_region->num_lanes = ndr_desc->num_lanes;
+ nd_region->numa_node = ndr_desc->numa_node;
ida_init(&nd_region->ns_ida);
dev = &nd_region->dev;
dev_set_name(dev, "region%d", nd_region->id);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index dc799a2..30b3dea 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -89,6 +89,7 @@ struct nd_region_desc {
struct nd_interleave_set *nd_set;
void *provider_data;
int num_lanes;
+ int numa_node;
};
struct nvdimm_bus;
Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
An example of numa_node values on a 2-socket system with a single
NVDIMM range on each socket is shown below.
/sys/bus/nd/devices
|-- btt0/numa_node:-1
|-- btt1/numa_node:0
|-- namespace0.0/numa_node:0
|-- namespace1.0/numa_node:1
|-- region0/numa_node:0
|-- region1/numa_node:1
These numa_node files are then linked under the block class of
their device names.
/sys/class/block/pmem0/device/numa_node:0
/sys/class/block/pmem0s/device/numa_node:0
/sys/class/block/pmem1/device/numa_node:1
This enables numactl(8) to accept 'block:' and 'file:' paths of
pmem and btt devices as shown in the examples below.
numactl --preferred block:pmem0 --show
numactl --preferred file:/dev/pmem0s --show
Signed-off-by: Toshi Kani <[email protected]>
---
drivers/acpi/nfit.c | 1 +
drivers/nvdimm/btt_devs.c | 1 +
drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
drivers/nvdimm/namespace_devs.c | 1 +
include/linux/libnvdimm.h | 1 +
5 files changed, 34 insertions(+)
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 5997753..9cb63ac 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
&nd_region_attribute_group,
&nd_mapping_attribute_group,
&nd_device_attribute_group,
+ &nd_numa_attribute_group,
&acpi_nfit_region_attribute_group,
NULL,
};
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index bcf77dc..a7b192f 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
static const struct attribute_group *nd_btt_attribute_groups[] = {
&nd_btt_attribute_group,
&nd_device_attribute_group,
+ &nd_numa_attribute_group,
NULL,
};
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 67525f9..03c0ee1 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
};
EXPORT_SYMBOL_GPL(nd_device_attribute_group);
+static ssize_t numa_node_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
+static struct attribute *nd_numa_attributes[] = {
+ &dev_attr_numa_node.attr,
+ NULL,
+};
+
+static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ if (!IS_ENABLED(CONFIG_NUMA))
+ return 0;
+
+ return a->mode;
+}
+
+/**
+ * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
+ */
+struct attribute_group nd_numa_attribute_group = {
+ .attrs = nd_numa_attributes,
+ .is_visible = nd_numa_attr_visible,
+};
+EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
+
int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
{
dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 0fe541a..47f3f29 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1141,6 +1141,7 @@ static struct attribute_group nd_namespace_attribute_group = {
static const struct attribute_group *nd_namespace_attribute_groups[] = {
&nd_device_attribute_group,
&nd_namespace_attribute_group,
+ &nd_numa_attribute_group,
NULL,
};
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 30b3dea..75e3af0 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -38,6 +38,7 @@ enum {
extern struct attribute_group nvdimm_bus_attribute_group;
extern struct attribute_group nvdimm_attribute_group;
extern struct attribute_group nd_device_attribute_group;
+extern struct attribute_group nd_numa_attribute_group;
extern struct attribute_group nd_region_attribute_group;
extern struct attribute_group nd_mapping_attribute_group;
Do you have local_cpus and local_cpulist attributes as well?
User-space tools such as hwloc use those for binding near I/O devices,
although I guess we could have some CPU-less NVDIMM NUMA nodes?
Brice
Le 19/06/2015 20:18, Toshi Kani a ?crit :
> Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
> under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
> When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
>
> An example of numa_node values on a 2-socket system with a single
> NVDIMM range on each socket is shown below.
> /sys/bus/nd/devices
> |-- btt0/numa_node:-1
> |-- btt1/numa_node:0
> |-- namespace0.0/numa_node:0
> |-- namespace1.0/numa_node:1
> |-- region0/numa_node:0
> |-- region1/numa_node:1
>
> These numa_node files are then linked under the block class of
> their device names.
> /sys/class/block/pmem0/device/numa_node:0
> /sys/class/block/pmem0s/device/numa_node:0
> /sys/class/block/pmem1/device/numa_node:1
>
> This enables numactl(8) to accept 'block:' and 'file:' paths of
> pmem and btt devices as shown in the examples below.
> numactl --preferred block:pmem0 --show
> numactl --preferred file:/dev/pmem0s --show
>
> Signed-off-by: Toshi Kani <[email protected]>
> ---
> drivers/acpi/nfit.c | 1 +
> drivers/nvdimm/btt_devs.c | 1 +
> drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
> drivers/nvdimm/namespace_devs.c | 1 +
> include/linux/libnvdimm.h | 1 +
> 5 files changed, 34 insertions(+)
>
> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> index 5997753..9cb63ac 100644
> --- a/drivers/acpi/nfit.c
> +++ b/drivers/acpi/nfit.c
> @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
> &nd_region_attribute_group,
> &nd_mapping_attribute_group,
> &nd_device_attribute_group,
> + &nd_numa_attribute_group,
> &acpi_nfit_region_attribute_group,
> NULL,
> };
> diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
> index bcf77dc..a7b192f 100644
> --- a/drivers/nvdimm/btt_devs.c
> +++ b/drivers/nvdimm/btt_devs.c
> @@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
> static const struct attribute_group *nd_btt_attribute_groups[] = {
> &nd_btt_attribute_group,
> &nd_device_attribute_group,
> + &nd_numa_attribute_group,
> NULL,
> };
>
> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> index 67525f9..03c0ee1 100644
> --- a/drivers/nvdimm/bus.c
> +++ b/drivers/nvdimm/bus.c
> @@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
> };
> EXPORT_SYMBOL_GPL(nd_device_attribute_group);
>
> +static ssize_t numa_node_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + return sprintf(buf, "%d\n", dev_to_node(dev));
> +}
> +static DEVICE_ATTR_RO(numa_node);
> +
> +static struct attribute *nd_numa_attributes[] = {
> + &dev_attr_numa_node.attr,
> + NULL,
> +};
> +
> +static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
> + int n)
> +{
> + if (!IS_ENABLED(CONFIG_NUMA))
> + return 0;
> +
> + return a->mode;
> +}
> +
> +/**
> + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
> + */
> +struct attribute_group nd_numa_attribute_group = {
> + .attrs = nd_numa_attributes,
> + .is_visible = nd_numa_attr_visible,
> +};
> +EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
> +
> int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
> {
> dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
> diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
> index 0fe541a..47f3f29 100644
> --- a/drivers/nvdimm/namespace_devs.c
> +++ b/drivers/nvdimm/namespace_devs.c
> @@ -1141,6 +1141,7 @@ static struct attribute_group nd_namespace_attribute_group = {
> static const struct attribute_group *nd_namespace_attribute_groups[] = {
> &nd_device_attribute_group,
> &nd_namespace_attribute_group,
> + &nd_numa_attribute_group,
> NULL,
> };
>
> diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
> index 30b3dea..75e3af0 100644
> --- a/include/linux/libnvdimm.h
> +++ b/include/linux/libnvdimm.h
> @@ -38,6 +38,7 @@ enum {
> extern struct attribute_group nvdimm_bus_attribute_group;
> extern struct attribute_group nvdimm_attribute_group;
> extern struct attribute_group nd_device_attribute_group;
> +extern struct attribute_group nd_numa_attribute_group;
> extern struct attribute_group nd_region_attribute_group;
> extern struct attribute_group nd_mapping_attribute_group;
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> Please read the FAQ at http://www.tux.org/lkml/
>
On Fri, 2015-06-19 at 22:01 +0200, Brice Goglin wrote:
> Do you have local_cpus and local_cpulist attributes as well?
> User-space tools such as hwloc use those for binding near I/O devices,
> although I guess we could have some CPU-less NVDIMM NUMA nodes?
No, the patch does not create local_cpus and local_cpulist. I will look
into hwloc, and support it as a next item if it makes sense to NVDIMM.
Thanks for the input!
-Toshi
On Sat, 2015-06-20 at 01:26 +0200, Rafael J. Wysocki wrote:
> On Friday, June 19, 2015 12:18:32 PM Toshi Kani wrote:
:
> > +/**
> > + * acpi_map_pxm_to_online_node - Map proximity ID to on-line node
> > + * @pxm: ACPI proximity ID
> > + *
> > + * This is similar to acpi_map_pxm_to_node(), but always returns an on-line
> > + * node. When the mapped node from a given proximity ID is off-line, it
> > + * looks up the node distance table and returns the nearest on-line node.
> > + *
> > + * ACPI device drivers, which are called after the NUMA initialization has
> > + * completed in the kernel, can call this interface to obtain their device
> > + * NUMA topology from ACPI tables. Such drivers do not have to deal with
> > + * off-line nodes. A node may be off-line when a device proximity ID is
> > + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
> > + * "numa=off" on x86.
>
> The dash in "off-line" and "on-line" is not needed, we spell those things as
> "offline" and "online" as a rule.
>
> It looks good to me apart from this small detail.
Great! I will send [PATCH v3-UPDATE 1/3] with this update.
Thanks,
-Toshi
On Friday, June 19, 2015 12:18:32 PM Toshi Kani wrote:
> The kernel initializes CPU & memory's NUMA topology from ACPI
> SRAT table. Some other ACPI tables, such as NFIT and DMAR, also
> contain proximity IDs for their device's NUMA topology. This
> information can be used to improve performance of these devices.
>
> This patch introduces acpi_map_pxm_to_online_node(), which is
> similar to acpi_map_pxm_to_node(), but always returns an on-line
> node. When the mapped node from a given proximity ID is off-line,
> it looks up the node distance table and returns the nearest
> on-line node.
>
> ACPI device drivers, which are called after the NUMA initialization
> has completed in the kernel, can call this interface to obtain their
> device NUMA topology from ACPI tables. Such drivers do not have to
> deal with off-line nodes. A node may be off-line when a device
> proximity ID is unique, SRAT memory entry does not exist, or NUMA is
> disabled, ex. "numa=off" on x86.
>
> This patch also moves the pxm range check from acpi_get_node() to
> acpi_map_pxm_to_node().
>
> Signed-off-by: Toshi Kani <[email protected]>
> ---
> drivers/acpi/numa.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
> include/linux/acpi.h | 5 +++++
> 2 files changed, 52 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
> index 1333cbdc..0b6e3a0 100644
> --- a/drivers/acpi/numa.c
> +++ b/drivers/acpi/numa.c
> @@ -29,6 +29,8 @@
> #include <linux/errno.h>
> #include <linux/acpi.h>
> #include <linux/numa.h>
> +#include <linux/nodemask.h>
> +#include <linux/topology.h>
>
> #define PREFIX "ACPI: "
>
> @@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node)
>
> int acpi_map_pxm_to_node(int pxm)
> {
> - int node = pxm_to_node_map[pxm];
> + int node;
> +
> + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
> + return NUMA_NO_NODE;
> +
> + node = pxm_to_node_map[pxm];
>
> if (node == NUMA_NO_NODE) {
> if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
> @@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm)
> return node;
> }
>
> +/**
> + * acpi_map_pxm_to_online_node - Map proximity ID to on-line node
> + * @pxm: ACPI proximity ID
> + *
> + * This is similar to acpi_map_pxm_to_node(), but always returns an on-line
> + * node. When the mapped node from a given proximity ID is off-line, it
> + * looks up the node distance table and returns the nearest on-line node.
> + *
> + * ACPI device drivers, which are called after the NUMA initialization has
> + * completed in the kernel, can call this interface to obtain their device
> + * NUMA topology from ACPI tables. Such drivers do not have to deal with
> + * off-line nodes. A node may be off-line when a device proximity ID is
> + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
> + * "numa=off" on x86.
The dash in "off-line" and "on-line" is not needed, we spell those things as
"offline" and "online" as a rule.
It looks good to me apart from this small detail.
Thanks,
Rafael
On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
> Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
> under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
> When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
>
> An example of numa_node values on a 2-socket system with a single
> NVDIMM range on each socket is shown below.
> /sys/bus/nd/devices
> |-- btt0/numa_node:-1
> |-- btt1/numa_node:0
> |-- namespace0.0/numa_node:0
> |-- namespace1.0/numa_node:1
> |-- region0/numa_node:0
> |-- region1/numa_node:1
>
> These numa_node files are then linked under the block class of
> their device names.
> /sys/class/block/pmem0/device/numa_node:0
> /sys/class/block/pmem0s/device/numa_node:0
> /sys/class/block/pmem1/device/numa_node:1
>
> This enables numactl(8) to accept 'block:' and 'file:' paths of
> pmem and btt devices as shown in the examples below.
> numactl --preferred block:pmem0 --show
> numactl --preferred file:/dev/pmem0s --show
>
> Signed-off-by: Toshi Kani <[email protected]>
> ---
> drivers/acpi/nfit.c | 1 +
> drivers/nvdimm/btt_devs.c | 1 +
> drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
> drivers/nvdimm/namespace_devs.c | 1 +
> include/linux/libnvdimm.h | 1 +
> 5 files changed, 34 insertions(+)
>
> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> index 5997753..9cb63ac 100644
> --- a/drivers/acpi/nfit.c
> +++ b/drivers/acpi/nfit.c
> @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
> &nd_region_attribute_group,
> &nd_mapping_attribute_group,
> &nd_device_attribute_group,
> + &nd_numa_attribute_group,
> &acpi_nfit_region_attribute_group,
> NULL,
> };
> diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
> index bcf77dc..a7b192f 100644
> --- a/drivers/nvdimm/btt_devs.c
> +++ b/drivers/nvdimm/btt_devs.c
> @@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
> static const struct attribute_group *nd_btt_attribute_groups[] = {
> &nd_btt_attribute_group,
> &nd_device_attribute_group,
> + &nd_numa_attribute_group,
> NULL,
> };
>
> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> index 67525f9..03c0ee1 100644
> --- a/drivers/nvdimm/bus.c
> +++ b/drivers/nvdimm/bus.c
> @@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
> };
> EXPORT_SYMBOL_GPL(nd_device_attribute_group);
>
> +static ssize_t numa_node_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + return sprintf(buf, "%d\n", dev_to_node(dev));
> +}
So patch 2 collided with the requested BTT stacking rework and
prompted me to take a closer look. Shouldn't numa_node_show() be
changed like this?
@@ -273,7 +273,12 @@ EXPORT_SYMBOL_GPL(nd_device_attribute_group);
static ssize_t numa_node_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", dev_to_node(dev));
+ if (is_nd_region(dev))
+ return sprintf(buf, "%d\n", dev_to_node(dev));
+ else if (is_nd_region(dev->parent))
+ return sprintf(buf, "%d\n", dev_to_node(dev->parent));
+ else
+ return sprintf(buf, "-1\n");
}
static DEVICE_ATTR_RO(numa_node);
On Tue, Jun 23, 2015 at 5:26 PM, Dan Williams <[email protected]> wrote:
> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
>> Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
>> under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
>> When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
>>
>> An example of numa_node values on a 2-socket system with a single
>> NVDIMM range on each socket is shown below.
>> /sys/bus/nd/devices
>> |-- btt0/numa_node:-1
>> |-- btt1/numa_node:0
>> |-- namespace0.0/numa_node:0
>> |-- namespace1.0/numa_node:1
>> |-- region0/numa_node:0
>> |-- region1/numa_node:1
>>
>> These numa_node files are then linked under the block class of
>> their device names.
>> /sys/class/block/pmem0/device/numa_node:0
>> /sys/class/block/pmem0s/device/numa_node:0
>> /sys/class/block/pmem1/device/numa_node:1
>>
>> This enables numactl(8) to accept 'block:' and 'file:' paths of
>> pmem and btt devices as shown in the examples below.
>> numactl --preferred block:pmem0 --show
>> numactl --preferred file:/dev/pmem0s --show
>>
>> Signed-off-by: Toshi Kani <[email protected]>
>> ---
>> drivers/acpi/nfit.c | 1 +
>> drivers/nvdimm/btt_devs.c | 1 +
>> drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
>> drivers/nvdimm/namespace_devs.c | 1 +
>> include/linux/libnvdimm.h | 1 +
>> 5 files changed, 34 insertions(+)
>>
>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>> index 5997753..9cb63ac 100644
>> --- a/drivers/acpi/nfit.c
>> +++ b/drivers/acpi/nfit.c
>> @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
>> &nd_region_attribute_group,
>> &nd_mapping_attribute_group,
>> &nd_device_attribute_group,
>> + &nd_numa_attribute_group,
>> &acpi_nfit_region_attribute_group,
>> NULL,
>> };
>> diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
>> index bcf77dc..a7b192f 100644
>> --- a/drivers/nvdimm/btt_devs.c
>> +++ b/drivers/nvdimm/btt_devs.c
>> @@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
>> static const struct attribute_group *nd_btt_attribute_groups[] = {
>> &nd_btt_attribute_group,
>> &nd_device_attribute_group,
>> + &nd_numa_attribute_group,
>> NULL,
>> };
>>
>> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
>> index 67525f9..03c0ee1 100644
>> --- a/drivers/nvdimm/bus.c
>> +++ b/drivers/nvdimm/bus.c
>> @@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
>> };
>> EXPORT_SYMBOL_GPL(nd_device_attribute_group);
>>
>> +static ssize_t numa_node_show(struct device *dev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + return sprintf(buf, "%d\n", dev_to_node(dev));
>> +}
>
> So patch 2 collided with the requested BTT stacking rework and
> prompted me to take a closer look. Shouldn't numa_node_show() be
> changed like this?
>
> @@ -273,7 +273,12 @@ EXPORT_SYMBOL_GPL(nd_device_attribute_group);
> static ssize_t numa_node_show(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> - return sprintf(buf, "%d\n", dev_to_node(dev));
> + if (is_nd_region(dev))
> + return sprintf(buf, "%d\n", dev_to_node(dev));
> + else if (is_nd_region(dev->parent))
> + return sprintf(buf, "%d\n", dev_to_node(dev->parent));
> + else
> + return sprintf(buf, "-1\n");
> }
> static DEVICE_ATTR_RO(numa_node);
Where is_nd_region() is:
+static bool is_nd_region(struct device *dev)
+{
+ if (is_nd_pmem(dev) || is_nd_blk(dev))
+ return true;
+ return false;
+}
+
On Tue, 2015-06-23 at 17:26 -0700, Dan Williams wrote:
> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
> > Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
> > under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
> > When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
> >
> > An example of numa_node values on a 2-socket system with a single
> > NVDIMM range on each socket is shown below.
> > /sys/bus/nd/devices
> > |-- btt0/numa_node:-1
> > |-- btt1/numa_node:0
> > |-- namespace0.0/numa_node:0
> > |-- namespace1.0/numa_node:1
> > |-- region0/numa_node:0
> > |-- region1/numa_node:1
> >
> > These numa_node files are then linked under the block class of
> > their device names.
> > /sys/class/block/pmem0/device/numa_node:0
> > /sys/class/block/pmem0s/device/numa_node:0
> > /sys/class/block/pmem1/device/numa_node:1
> >
> > This enables numactl(8) to accept 'block:' and 'file:' paths of
> > pmem and btt devices as shown in the examples below.
> > numactl --preferred block:pmem0 --show
> > numactl --preferred file:/dev/pmem0s --show
> >
> > Signed-off-by: Toshi Kani <[email protected]>
> > ---
> > drivers/acpi/nfit.c | 1 +
> > drivers/nvdimm/btt_devs.c | 1 +
> > drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
> > drivers/nvdimm/namespace_devs.c | 1 +
> > include/linux/libnvdimm.h | 1 +
> > 5 files changed, 34 insertions(+)
> >
> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> > index 5997753..9cb63ac 100644
> > --- a/drivers/acpi/nfit.c
> > +++ b/drivers/acpi/nfit.c
> > @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
> > &nd_region_attribute_group,
> > &nd_mapping_attribute_group,
> > &nd_device_attribute_group,
> > + &nd_numa_attribute_group,
> > &acpi_nfit_region_attribute_group,
> > NULL,
> > };
> > diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
> > index bcf77dc..a7b192f 100644
> > --- a/drivers/nvdimm/btt_devs.c
> > +++ b/drivers/nvdimm/btt_devs.c
> > @@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
> > static const struct attribute_group *nd_btt_attribute_groups[] = {
> > &nd_btt_attribute_group,
> > &nd_device_attribute_group,
> > + &nd_numa_attribute_group,
> > NULL,
> > };
> >
> > diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> > index 67525f9..03c0ee1 100644
> > --- a/drivers/nvdimm/bus.c
> > +++ b/drivers/nvdimm/bus.c
> > @@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
> > };
> > EXPORT_SYMBOL_GPL(nd_device_attribute_group);
> >
> > +static ssize_t numa_node_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + return sprintf(buf, "%d\n", dev_to_node(dev));
> > +}
>
> So patch 2 collided with the requested BTT stacking rework and
> prompted me to take a closer look. Shouldn't numa_node_show() be
> changed like this?
numa_node_show() is listed in its own nd_numa_attribute_group for using
is_visible. This nd_numa_attribute_group is then listed by region
(acpi_nfit_region_attribute_groups), namespace
(nd_namespace_attribute_groups), and btt (nd_btt_attribute_groups).
Therefore, numa_node_show() is only called with these device objects.
So, I do not think we need such change. Or are you suggesting to change
the way attribute group is set?
Thanks,
-Toshi
> @@ -273,7 +273,12 @@ EXPORT_SYMBOL_GPL(nd_device_attribute_group);
> static ssize_t numa_node_show(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> - return sprintf(buf, "%d\n", dev_to_node(dev));
> + if (is_nd_region(dev))
> + return sprintf(buf, "%d\n", dev_to_node(dev));
> + else if (is_nd_region(dev->parent))
> + return sprintf(buf, "%d\n", dev_to_node(dev->parent));
> + else
> + return sprintf(buf, "-1\n");
> }
> static DEVICE_ATTR_RO(numa_node);
On Wed, Jun 24, 2015 at 9:38 AM, Toshi Kani <[email protected]> wrote:
> On Tue, 2015-06-23 at 17:26 -0700, Dan Williams wrote:
>> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
>> > Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
>> > under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.
>> > When bttN is not set up, its numa_node returns -1 (NUMA_NO_NODE).
>> >
>> > An example of numa_node values on a 2-socket system with a single
>> > NVDIMM range on each socket is shown below.
>> > /sys/bus/nd/devices
>> > |-- btt0/numa_node:-1
>> > |-- btt1/numa_node:0
>> > |-- namespace0.0/numa_node:0
>> > |-- namespace1.0/numa_node:1
>> > |-- region0/numa_node:0
>> > |-- region1/numa_node:1
>> >
>> > These numa_node files are then linked under the block class of
>> > their device names.
>> > /sys/class/block/pmem0/device/numa_node:0
>> > /sys/class/block/pmem0s/device/numa_node:0
>> > /sys/class/block/pmem1/device/numa_node:1
>> >
>> > This enables numactl(8) to accept 'block:' and 'file:' paths of
>> > pmem and btt devices as shown in the examples below.
>> > numactl --preferred block:pmem0 --show
>> > numactl --preferred file:/dev/pmem0s --show
>> >
>> > Signed-off-by: Toshi Kani <[email protected]>
>> > ---
>> > drivers/acpi/nfit.c | 1 +
>> > drivers/nvdimm/btt_devs.c | 1 +
>> > drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++
>> > drivers/nvdimm/namespace_devs.c | 1 +
>> > include/linux/libnvdimm.h | 1 +
>> > 5 files changed, 34 insertions(+)
>> >
>> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>> > index 5997753..9cb63ac 100644
>> > --- a/drivers/acpi/nfit.c
>> > +++ b/drivers/acpi/nfit.c
>> > @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
>> > &nd_region_attribute_group,
>> > &nd_mapping_attribute_group,
>> > &nd_device_attribute_group,
>> > + &nd_numa_attribute_group,
>> > &acpi_nfit_region_attribute_group,
>> > NULL,
>> > };
>> > diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
>> > index bcf77dc..a7b192f 100644
>> > --- a/drivers/nvdimm/btt_devs.c
>> > +++ b/drivers/nvdimm/btt_devs.c
>> > @@ -308,6 +308,7 @@ static struct attribute_group nd_btt_attribute_group = {
>> > static const struct attribute_group *nd_btt_attribute_groups[] = {
>> > &nd_btt_attribute_group,
>> > &nd_device_attribute_group,
>> > + &nd_numa_attribute_group,
>> > NULL,
>> > };
>> >
>> > diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
>> > index 67525f9..03c0ee1 100644
>> > --- a/drivers/nvdimm/bus.c
>> > +++ b/drivers/nvdimm/bus.c
>> > @@ -420,6 +420,36 @@ struct attribute_group nd_device_attribute_group = {
>> > };
>> > EXPORT_SYMBOL_GPL(nd_device_attribute_group);
>> >
>> > +static ssize_t numa_node_show(struct device *dev,
>> > + struct device_attribute *attr, char *buf)
>> > +{
>> > + return sprintf(buf, "%d\n", dev_to_node(dev));
>> > +}
>>
>> So patch 2 collided with the requested BTT stacking rework and
>> prompted me to take a closer look. Shouldn't numa_node_show() be
>> changed like this?
>
> numa_node_show() is listed in its own nd_numa_attribute_group for using
> is_visible. This nd_numa_attribute_group is then listed by region
> (acpi_nfit_region_attribute_groups), namespace
> (nd_namespace_attribute_groups), and btt (nd_btt_attribute_groups).
> Therefore, numa_node_show() is only called with these device objects.
> So, I do not think we need such change. Or are you suggesting to change
> the way attribute group is set?
No, my mistake I missed this hunk in drivers/nvdimm/region.c
@@ -47,6 +47,7 @@ static int nd_region_probe(struct device *dev)
num_ns->active = rc;
num_ns->count = rc + err;
+ set_dev_node(dev, nd_region->numa_node);
dev_set_drvdata(dev, num_ns);
if (rc && err && rc == err)
On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
> ACPI NFIT table has System Physical Address Range Structure
> entries that describe a proximity ID of each range when
> ACPI_NFIT_PROXIMITY_VALID is set in the flags.
>
> Change acpi_nfit_register_region() to map a proximity ID to its
> node ID, and set it to a new numa_node field of nd_region_desc,
> which is then conveyed to nd_region.
>
> nd_region_probe() and nd_btt_probe() set the numa_node of nd_region
> to their device object being probed. A namespace device inherits
> the numa_node from the parent region device.
>
> Signed-off-by: Toshi Kani <[email protected]>
> ---
> drivers/acpi/nfit.c | 6 ++++++
> drivers/nvdimm/btt.c | 2 ++
> drivers/nvdimm/nd.h | 1 +
> drivers/nvdimm/region.c | 1 +
> drivers/nvdimm/region_devs.c | 1 +
> include/linux/libnvdimm.h | 1 +
> 6 files changed, 12 insertions(+)
>
> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> index 5f64582..5997753 100644
> --- a/drivers/acpi/nfit.c
> +++ b/drivers/acpi/nfit.c
> @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
> ndr_desc->res = &res;
> ndr_desc->provider_data = nfit_spa;
> ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
> + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
> + ndr_desc->numa_node = acpi_map_pxm_to_online_node(
> + spa->proximity_domain);
> + else
> + ndr_desc->numa_node = NUMA_NO_NODE;
> +
> list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
> struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
> struct nd_mapping *nd_mapping;
> diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> index 57d3b27..ab082e5 100644
> --- a/drivers/nvdimm/btt.c
> +++ b/drivers/nvdimm/btt.c
> @@ -1495,6 +1495,8 @@ static int nd_btt_probe(struct device *dev)
> rc = -ENOMEM;
> goto err_btt;
> }
> +
> + set_dev_node(dev, nd_region->numa_node);
> dev_set_drvdata(dev, btt);
>
> return 0;
> diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
> index 011d7c5..0bfd20a 100644
> --- a/drivers/nvdimm/nd.h
> +++ b/drivers/nvdimm/nd.h
> @@ -93,6 +93,7 @@ struct nd_region {
> u64 ndr_size;
> u64 ndr_start;
> int id, num_lanes;
> + int numa_node;
> void *provider_data;
> struct nd_interleave_set *nd_set;
> struct nd_mapping mapping[0];
> diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
> index d9d82e7..a764ca6 100644
> --- a/drivers/nvdimm/region.c
> +++ b/drivers/nvdimm/region.c
> @@ -123,6 +123,7 @@ static int nd_region_probe(struct device *dev)
>
> num_ns->active = rc;
> num_ns->count = rc + err;
> + set_dev_node(dev, nd_region->numa_node);
> dev_set_drvdata(dev, num_ns);
>
> if (err == 0)
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index bb9f329..703dfae 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -632,6 +632,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
> nd_region->provider_data = ndr_desc->provider_data;
> nd_region->nd_set = ndr_desc->nd_set;
> nd_region->num_lanes = ndr_desc->num_lanes;
> + nd_region->numa_node = ndr_desc->numa_node;
Why introduce nd_region->numa_node? Why not do set_dev_node() directly here?
I can make this change locally if you agree, we don't need to wait
until probe to set these.
On Wed, 2015-06-24 at 09:50 -0700, Dan Williams wrote:
> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
> > ACPI NFIT table has System Physical Address Range Structure
> > entries that describe a proximity ID of each range when
> > ACPI_NFIT_PROXIMITY_VALID is set in the flags.
> >
> > Change acpi_nfit_register_region() to map a proximity ID to its
> > node ID, and set it to a new numa_node field of nd_region_desc,
> > which is then conveyed to nd_region.
> >
> > nd_region_probe() and nd_btt_probe() set the numa_node of nd_region
> > to their device object being probed. A namespace device inherits
> > the numa_node from the parent region device.
> >
> > Signed-off-by: Toshi Kani <[email protected]>
> > ---
> > drivers/acpi/nfit.c | 6 ++++++
> > drivers/nvdimm/btt.c | 2 ++
> > drivers/nvdimm/nd.h | 1 +
> > drivers/nvdimm/region.c | 1 +
> > drivers/nvdimm/region_devs.c | 1 +
> > include/linux/libnvdimm.h | 1 +
> > 6 files changed, 12 insertions(+)
> >
> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> > index 5f64582..5997753 100644
> > --- a/drivers/acpi/nfit.c
> > +++ b/drivers/acpi/nfit.c
> > @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
> > ndr_desc->res = &res;
> > ndr_desc->provider_data = nfit_spa;
> > ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
> > + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
> > + ndr_desc->numa_node = acpi_map_pxm_to_online_node(
> > + spa->proximity_domain);
> > + else
> > + ndr_desc->numa_node = NUMA_NO_NODE;
> > +
> > list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
> > struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
> > struct nd_mapping *nd_mapping;
> > diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> > index 57d3b27..ab082e5 100644
> > --- a/drivers/nvdimm/btt.c
> > +++ b/drivers/nvdimm/btt.c
> > @@ -1495,6 +1495,8 @@ static int nd_btt_probe(struct device *dev)
> > rc = -ENOMEM;
> > goto err_btt;
> > }
> > +
> > + set_dev_node(dev, nd_region->numa_node);
> > dev_set_drvdata(dev, btt);
> >
> > return 0;
> > diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
> > index 011d7c5..0bfd20a 100644
> > --- a/drivers/nvdimm/nd.h
> > +++ b/drivers/nvdimm/nd.h
> > @@ -93,6 +93,7 @@ struct nd_region {
> > u64 ndr_size;
> > u64 ndr_start;
> > int id, num_lanes;
> > + int numa_node;
> > void *provider_data;
> > struct nd_interleave_set *nd_set;
> > struct nd_mapping mapping[0];
> > diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
> > index d9d82e7..a764ca6 100644
> > --- a/drivers/nvdimm/region.c
> > +++ b/drivers/nvdimm/region.c
> > @@ -123,6 +123,7 @@ static int nd_region_probe(struct device *dev)
> >
> > num_ns->active = rc;
> > num_ns->count = rc + err;
> > + set_dev_node(dev, nd_region->numa_node);
> > dev_set_drvdata(dev, num_ns);
> >
> > if (err == 0)
> > diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> > index bb9f329..703dfae 100644
> > --- a/drivers/nvdimm/region_devs.c
> > +++ b/drivers/nvdimm/region_devs.c
> > @@ -632,6 +632,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
> > nd_region->provider_data = ndr_desc->provider_data;
> > nd_region->nd_set = ndr_desc->nd_set;
> > nd_region->num_lanes = ndr_desc->num_lanes;
> > + nd_region->numa_node = ndr_desc->numa_node;
>
> Why introduce nd_region->numa_node? Why not do set_dev_node() directly here?
>
> I can make this change locally if you agree, we don't need to wait
> until probe to set these.
dev->numa_node cannot be set here because nd_region_create() then calls
nd_device_register() -> device_initialize() -> set_dev_node(dev, -1).
So, it gets overwritten with -1.
Thanks,
-Toshi
On Wed, Jun 24, 2015 at 10:05 AM, Toshi Kani <[email protected]> wrote:
> On Wed, 2015-06-24 at 09:50 -0700, Dan Williams wrote:
>> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
>> > ACPI NFIT table has System Physical Address Range Structure
>> > entries that describe a proximity ID of each range when
>> > ACPI_NFIT_PROXIMITY_VALID is set in the flags.
>> >
>> > Change acpi_nfit_register_region() to map a proximity ID to its
>> > node ID, and set it to a new numa_node field of nd_region_desc,
>> > which is then conveyed to nd_region.
>> >
>> > nd_region_probe() and nd_btt_probe() set the numa_node of nd_region
>> > to their device object being probed. A namespace device inherits
>> > the numa_node from the parent region device.
>> >
>> > Signed-off-by: Toshi Kani <[email protected]>
>> > ---
>> > drivers/acpi/nfit.c | 6 ++++++
>> > drivers/nvdimm/btt.c | 2 ++
>> > drivers/nvdimm/nd.h | 1 +
>> > drivers/nvdimm/region.c | 1 +
>> > drivers/nvdimm/region_devs.c | 1 +
>> > include/linux/libnvdimm.h | 1 +
>> > 6 files changed, 12 insertions(+)
>> >
>> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>> > index 5f64582..5997753 100644
>> > --- a/drivers/acpi/nfit.c
>> > +++ b/drivers/acpi/nfit.c
>> > @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
>> > ndr_desc->res = &res;
>> > ndr_desc->provider_data = nfit_spa;
>> > ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
>> > + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
>> > + ndr_desc->numa_node = acpi_map_pxm_to_online_node(
>> > + spa->proximity_domain);
>> > + else
>> > + ndr_desc->numa_node = NUMA_NO_NODE;
>> > +
>> > list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
>> > struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
>> > struct nd_mapping *nd_mapping;
>> > diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
>> > index 57d3b27..ab082e5 100644
>> > --- a/drivers/nvdimm/btt.c
>> > +++ b/drivers/nvdimm/btt.c
>> > @@ -1495,6 +1495,8 @@ static int nd_btt_probe(struct device *dev)
>> > rc = -ENOMEM;
>> > goto err_btt;
>> > }
>> > +
>> > + set_dev_node(dev, nd_region->numa_node);
>> > dev_set_drvdata(dev, btt);
>> >
>> > return 0;
>> > diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
>> > index 011d7c5..0bfd20a 100644
>> > --- a/drivers/nvdimm/nd.h
>> > +++ b/drivers/nvdimm/nd.h
>> > @@ -93,6 +93,7 @@ struct nd_region {
>> > u64 ndr_size;
>> > u64 ndr_start;
>> > int id, num_lanes;
>> > + int numa_node;
>> > void *provider_data;
>> > struct nd_interleave_set *nd_set;
>> > struct nd_mapping mapping[0];
>> > diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
>> > index d9d82e7..a764ca6 100644
>> > --- a/drivers/nvdimm/region.c
>> > +++ b/drivers/nvdimm/region.c
>> > @@ -123,6 +123,7 @@ static int nd_region_probe(struct device *dev)
>> >
>> > num_ns->active = rc;
>> > num_ns->count = rc + err;
>> > + set_dev_node(dev, nd_region->numa_node);
>> > dev_set_drvdata(dev, num_ns);
>> >
>> > if (err == 0)
>> > diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
>> > index bb9f329..703dfae 100644
>> > --- a/drivers/nvdimm/region_devs.c
>> > +++ b/drivers/nvdimm/region_devs.c
>> > @@ -632,6 +632,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
>> > nd_region->provider_data = ndr_desc->provider_data;
>> > nd_region->nd_set = ndr_desc->nd_set;
>> > nd_region->num_lanes = ndr_desc->num_lanes;
>> > + nd_region->numa_node = ndr_desc->numa_node;
>>
>> Why introduce nd_region->numa_node? Why not do set_dev_node() directly here?
>>
>> I can make this change locally if you agree, we don't need to wait
>> until probe to set these.
>
> dev->numa_node cannot be set here because nd_region_create() then calls
> nd_device_register() -> device_initialize() -> set_dev_node(dev, -1).
> So, it gets overwritten with -1.
Ah, ok.
Still I'd rather set this permanent property of a device at create
time, so I'll re-work create to initialize the device and set the node
before registering.
On Wed, 2015-06-24 at 10:08 -0700, Dan Williams wrote:
> On Wed, Jun 24, 2015 at 10:05 AM, Toshi Kani <[email protected]> wrote:
> > On Wed, 2015-06-24 at 09:50 -0700, Dan Williams wrote:
> >> On Fri, Jun 19, 2015 at 11:18 AM, Toshi Kani <[email protected]> wrote:
> >> > ACPI NFIT table has System Physical Address Range Structure
> >> > entries that describe a proximity ID of each range when
> >> > ACPI_NFIT_PROXIMITY_VALID is set in the flags.
> >> >
> >> > Change acpi_nfit_register_region() to map a proximity ID to its
> >> > node ID, and set it to a new numa_node field of nd_region_desc,
> >> > which is then conveyed to nd_region.
> >> >
> >> > nd_region_probe() and nd_btt_probe() set the numa_node of nd_region
> >> > to their device object being probed. A namespace device inherits
> >> > the numa_node from the parent region device.
> >> >
> >> > Signed-off-by: Toshi Kani <[email protected]>
> >> > ---
> >> > drivers/acpi/nfit.c | 6 ++++++
> >> > drivers/nvdimm/btt.c | 2 ++
> >> > drivers/nvdimm/nd.h | 1 +
> >> > drivers/nvdimm/region.c | 1 +
> >> > drivers/nvdimm/region_devs.c | 1 +
> >> > include/linux/libnvdimm.h | 1 +
> >> > 6 files changed, 12 insertions(+)
> >> >
> >> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> >> > index 5f64582..5997753 100644
> >> > --- a/drivers/acpi/nfit.c
> >> > +++ b/drivers/acpi/nfit.c
> >> > @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
> >> > ndr_desc->res = &res;
> >> > ndr_desc->provider_data = nfit_spa;
> >> > ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
> >> > + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
> >> > + ndr_desc->numa_node = acpi_map_pxm_to_online_node(
> >> > + spa->proximity_domain);
> >> > + else
> >> > + ndr_desc->numa_node = NUMA_NO_NODE;
> >> > +
> >> > list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
> >> > struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
> >> > struct nd_mapping *nd_mapping;
> >> > diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> >> > index 57d3b27..ab082e5 100644
> >> > --- a/drivers/nvdimm/btt.c
> >> > +++ b/drivers/nvdimm/btt.c
> >> > @@ -1495,6 +1495,8 @@ static int nd_btt_probe(struct device *dev)
> >> > rc = -ENOMEM;
> >> > goto err_btt;
> >> > }
> >> > +
> >> > + set_dev_node(dev, nd_region->numa_node);
> >> > dev_set_drvdata(dev, btt);
> >> >
> >> > return 0;
> >> > diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
> >> > index 011d7c5..0bfd20a 100644
> >> > --- a/drivers/nvdimm/nd.h
> >> > +++ b/drivers/nvdimm/nd.h
> >> > @@ -93,6 +93,7 @@ struct nd_region {
> >> > u64 ndr_size;
> >> > u64 ndr_start;
> >> > int id, num_lanes;
> >> > + int numa_node;
> >> > void *provider_data;
> >> > struct nd_interleave_set *nd_set;
> >> > struct nd_mapping mapping[0];
> >> > diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
> >> > index d9d82e7..a764ca6 100644
> >> > --- a/drivers/nvdimm/region.c
> >> > +++ b/drivers/nvdimm/region.c
> >> > @@ -123,6 +123,7 @@ static int nd_region_probe(struct device *dev)
> >> >
> >> > num_ns->active = rc;
> >> > num_ns->count = rc + err;
> >> > + set_dev_node(dev, nd_region->numa_node);
> >> > dev_set_drvdata(dev, num_ns);
> >> >
> >> > if (err == 0)
> >> > diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> >> > index bb9f329..703dfae 100644
> >> > --- a/drivers/nvdimm/region_devs.c
> >> > +++ b/drivers/nvdimm/region_devs.c
> >> > @@ -632,6 +632,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
> >> > nd_region->provider_data = ndr_desc->provider_data;
> >> > nd_region->nd_set = ndr_desc->nd_set;
> >> > nd_region->num_lanes = ndr_desc->num_lanes;
> >> > + nd_region->numa_node = ndr_desc->numa_node;
> >>
> >> Why introduce nd_region->numa_node? Why not do set_dev_node() directly here?
> >>
> >> I can make this change locally if you agree, we don't need to wait
> >> until probe to set these.
> >
> > dev->numa_node cannot be set here because nd_region_create() then calls
> > nd_device_register() -> device_initialize() -> set_dev_node(dev, -1).
> > So, it gets overwritten with -1.
>
> Ah, ok.
>
> Still I'd rather set this permanent property of a device at create
> time, so I'll re-work create to initialize the device and set the node
> before registering.
I agree, it is cleaner in that way.
Thanks,
-Toshi