2023-11-30 21:49:45

by Srinivasulu Opensrc

[permalink] [raw]
Subject: [RFC PATCH 0/2] Node migration between memory tiers

From: Srinivasulu Thanneeru <[email protected]>

The memory tiers feature allows nodes with similar memory types
or performance characteristics to be grouped together in a
memory tier. However, there is currently no provision for
moving a node from one tier to another on demand.

This patch series adds support for on-demand node migration between
tiers, triggered by the sysadmin/root user through a new per-node
sysfs attribute. Each tier has a start abstract distance (adistance)
and a range.

To migrate a node to a tier, the corresponding node’s sysfs
adistance_offset is written with a value corresponding to
the tier’s adistance.

Example: Move node2 to memory tier5 from its default tier (i.e., 4)

1. Check default values:
$cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
0-2

$cat /sys/devices/system/node/node0/adistance_offset
0
$cat /sys/devices/system/node/node1/adistance_offset
0
$cat /sys/devices/system/node/node2/adistance_offset
0

2. Move node2 to tier5:

To move node2 from memory_tier4 (adistance=512) to
memory_tier5 (adistance=640), set the `adistance_offset` of
node 2 to 128 (i.e., 512 + 128 = 640).

Tier4 adistance start can be derived from the tier id
(i.e., for tier4, 4 << 7 = 512).

$echo 128 > /sys/devices/system/node/node2/adistance_offset
$cat /sys/devices/system/node/node2/adistance_offset
128

3. Verify node2's tier id:

$cat /sys/devices/virtual/memory_tiering/memory_tier5/nodelist
2
$cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
0-1

Srinivasulu Thanneeru (2):
base/node: Add sysfs for adistance_offset
memory tier: Support node migration between tiers

drivers/base/node.c | 51 +++++++++++++++++++++++
include/linux/memory-tiers.h | 11 +++++
include/linux/node.h | 6 +++
mm/memory-tiers.c | 79 ++++++++++++++++++++----------------
4 files changed, 113 insertions(+), 34 deletions(-)

--
2.25.1


2023-11-30 21:49:53

by Srinivasulu Opensrc

[permalink] [raw]
Subject: [RFC PATCH 2/2] memory tier: Support node migration between tiers

From: Srinivasulu Thanneeru <[email protected]>

Node migration enables the grouping or migration of nodes
between tiers based on nodes' latencies and bandwidth characteristics.
Since nodes of the same memory-type can exist in different tiers and can
migrate from one tier to another, it is necessary to maintain nodes
per tier instead of maintaining a list of nodes grouped using
memory type(siblings) within the tier.

To migrate a node from one tier to another, remove the node from the
current tier and insert it into the target tier. If the target tier does
not exist, create a new one.

Signed-off-by: Srinivasulu Thanneeru <[email protected]>
---
drivers/base/node.c | 6 ++++
include/linux/memory-tiers.h | 5 +++
include/linux/node.h | 5 +++
mm/memory-tiers.c | 65 +++++++++++++++++-------------------
4 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 1e63c692977b..8290ea96b439 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -608,10 +608,16 @@ static ssize_t adistance_offset_store(struct device *dev,
return -EINVAL;

node_devices[nid]->adistance_offset = value;
+ node_memtier_change(nid);
return size;
}
static DEVICE_ATTR_RW(adistance_offset);

+int get_node_adistance_offset(int nid)
+{
+ return node_devices[nid]->adistance_offset;
+}
+
static struct attribute *node_dev_attrs[] = {
&dev_attr_meminfo.attr,
&dev_attr_numastat.attr,
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index ff4e7136ab40..e86c23873334 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -49,6 +49,7 @@ int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
const char *source);
int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
int get_target_memtier_adistance(int node, int adistance_offset);
+void node_memtier_change(int node);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -142,5 +143,9 @@ static int get_target_memtier_adistance(int node, int adistance_offset)
{
return 0;
}
+
+static inline void node_memtier_change(int node)
+{
+}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/node.h b/include/linux/node.h
index fd0f4f3177f8..5150215b4922 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -139,6 +139,7 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
extern int register_memory_node_under_compute_node(unsigned int mem_nid,
unsigned int cpu_nid,
unsigned access);
+extern int get_node_adistance_offset(int nid);
#else
static inline void node_dev_init(void)
{
@@ -166,6 +167,10 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
}
+static inline int get_node_adistance_offset(int nid)
+{
+ return 0;
+}
#endif

#define to_node(device) container_of(device, struct node, dev)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index a40d4d4383d7..b6cd86977731 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -23,6 +23,8 @@ struct memory_tier {
struct device dev;
/* All the nodes that are part of all the lower memory tiers. */
nodemask_t lower_tier_mask;
+ /* Nodes linked to this tier*/
+ nodemask_t nodes;
};

struct demotion_nodes {
@@ -120,13 +122,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
- nodemask_t nodes = NODE_MASK_NONE;
- struct memory_dev_type *memtype;
-
- list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
- nodes_or(nodes, nodes, memtype->nodes);
-
- return nodes;
+ return memtier->nodes;
}

static void memory_tier_device_release(struct device *dev)
@@ -181,33 +177,22 @@ int get_target_memtier_adistance(int node, int adistance_offset)
return node_adistance;
}

-static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
+static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype,
+ int tier_adistance)
{
int ret;
bool found_slot = false;
struct memory_tier *memtier, *new_memtier;
- int adistance = memtype->adistance;
+ int adistance;
unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

lockdep_assert_held_once(&memory_tier_lock);

- adistance = round_down(adistance, memtier_adistance_chunk_size);
- /*
- * If the memtype is already part of a memory tier,
- * just return that.
- */
- if (!list_empty(&memtype->tier_sibling)) {
- list_for_each_entry(memtier, &memory_tiers, list) {
- if (adistance == memtier->adistance_start)
- return memtier;
- }
- WARN_ON(1);
- return ERR_PTR(-EINVAL);
- }
+ adistance = round_down(tier_adistance, memtier_adistance_chunk_size);

list_for_each_entry(memtier, &memory_tiers, list) {
if (adistance == memtier->adistance_start) {
- goto link_memtype;
+ return memtier;
} else if (adistance < memtier->adistance_start) {
found_slot = true;
break;
@@ -238,9 +223,6 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
return ERR_PTR(ret);
}
memtier = new_memtier;
-
-link_memtype:
- list_add(&memtype->tier_sibling, &memtier->memory_types);
return memtier;
}

@@ -499,7 +481,7 @@ static struct memory_tier *set_node_memory_tier(int node)
struct memory_tier *memtier;
struct memory_dev_type *memtype;
pg_data_t *pgdat = NODE_DATA(node);
-
+ int tier_adistance;

lockdep_assert_held_once(&memory_tier_lock);

@@ -510,9 +492,13 @@ static struct memory_tier *set_node_memory_tier(int node)

memtype = node_memory_types[node].memtype;
node_set(node, memtype->nodes);
- memtier = find_create_memory_tier(memtype);
+ tier_adistance = get_node_adistance_offset(node);
+ tier_adistance = memtype->adistance + tier_adistance;
+
+ memtier = find_create_memory_tier(memtype, tier_adistance);
if (!IS_ERR(memtier))
rcu_assign_pointer(pgdat->memtier, memtier);
+ node_set(node, memtier->nodes);
return memtier;
}

@@ -548,11 +534,9 @@ static bool clear_node_memory_tier(int node)
synchronize_rcu();
memtype = node_memory_types[node].memtype;
node_clear(node, memtype->nodes);
- if (nodes_empty(memtype->nodes)) {
- list_del_init(&memtype->tier_sibling);
- if (list_empty(&memtier->memory_types))
- destroy_memory_tier(memtier);
- }
+ node_clear(node, memtier->nodes);
+ if (nodes_empty(memtier->nodes))
+ destroy_memory_tier(memtier);
cleared = true;
}
return cleared;
@@ -575,7 +559,6 @@ struct memory_dev_type *alloc_memory_type(int adistance)
return ERR_PTR(-ENOMEM);

memtype->adistance = adistance;
- INIT_LIST_HEAD(&memtype->tier_sibling);
memtype->nodes = NODE_MASK_NONE;
kref_init(&memtype->kref);
return memtype;
@@ -615,6 +598,20 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

+void node_memtier_change(int node)
+{
+ struct memory_tier *memtier;
+
+ mutex_lock(&memory_tier_lock);
+ if (clear_node_memory_tier(node))
+ establish_demotion_targets();
+ memtier = set_node_memory_tier(node);
+ if (!IS_ERR(memtier))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+}
+
+
static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
{
pr_info(
--
2.25.1

2023-11-30 21:50:02

by Srinivasulu Opensrc

[permalink] [raw]
Subject: [RFC PATCH 1/2] base/node: Add sysfs for adistance_offset

From: Srinivasulu Thanneeru <[email protected]>

This patch adds a new field, adistance_offset, to struct node and
exposes it as a per-node sysfs attribute.
Using adistance_offset, a node can be migrated to a targeted tier.
The target tier's abstract distance (adistance) is calculated by adding
the adistance offset to the adistance of the node's memory type.

Signed-off-by: Srinivasulu Thanneeru <[email protected]>
Signed-off-by: Ravi Jonnalagadda <[email protected]>
---
drivers/base/node.c | 45 ++++++++++++++++++++++++++++++++++++
include/linux/memory-tiers.h | 6 +++++
include/linux/node.h | 1 +
mm/memory-tiers.c | 14 +++++++++++
4 files changed, 66 insertions(+)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 493d533f8375..1e63c692977b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
+#include <linux/memory-tiers.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
@@ -569,11 +570,54 @@ static ssize_t node_read_distance(struct device *dev,
}
static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);

+static ssize_t adistance_offset_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int nid = dev->id;
+ int len = 0;
+
+ /*
+ * buf is currently PAGE_SIZE in length and each node needs 4 chars
+ * at the most (distance + space or newline).
+ */
+ BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);
+
+ len += sysfs_emit(buf, "%d\n", node_devices[nid]->adistance_offset);
+ return len;
+}
+
+static ssize_t adistance_offset_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ int nid = dev->id;
+ int value, ret;
+
+ ret = kstrtoint(buf, 0, &value);
+
+ if (ret)
+ return ret;
+ if (node_devices[nid]->adistance_offset == value)
+ return size;
+ /*
+ * Request from a node to migrate to a memtier with negative
+ * adistance is not valid.
+ */
+ ret = get_target_memtier_adistance(nid, value);
+ if (ret < 0)
+ return -EINVAL;
+
+ node_devices[nid]->adistance_offset = value;
+ return size;
+}
+static DEVICE_ATTR_RW(adistance_offset);
+
static struct attribute *node_dev_attrs[] = {
&dev_attr_meminfo.attr,
&dev_attr_numastat.attr,
&dev_attr_distance.attr,
&dev_attr_vmstat.attr,
+ &dev_attr_adistance_offset.attr,
NULL
};

@@ -883,6 +927,7 @@ int __register_one_node(int nid)

INIT_LIST_HEAD(&node_devices[nid]->access_list);
node_init_caches(nid);
+ node_devices[nid]->adistance_offset = 0;

return error;
}
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 1e39d27bee41..ff4e7136ab40 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,7 @@ int mt_calc_adistance(int node, int *adist);
int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
const char *source);
int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
+int get_target_memtier_adistance(int node, int adistance_offset);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +137,10 @@ static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
{
return -EIO;
}
+
+static int get_target_memtier_adistance(int node, int adistance_offset)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/node.h b/include/linux/node.h
index 427a5975cf40..fd0f4f3177f8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -83,6 +83,7 @@ static inline void node_set_perf_attrs(unsigned int nid,
struct node {
struct device dev;
struct list_head access_list;
+ int adistance_offset;
#ifdef CONFIG_HMEM_REPORTING
struct list_head cache_attrs;
struct device *cache_dev;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 8d5291add2bc..a40d4d4383d7 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -167,6 +167,20 @@ static const struct attribute_group *memtier_dev_groups[] = {
NULL
};

+int get_target_memtier_adistance(int node, int adistance_offset)
+{
+ struct memory_dev_type *memtype;
+ int node_adistance;
+
+ memtype = node_memory_types[node].memtype;
+ /*
+ * Calculate the targeted memtier abstract distance from
+ * memtype adistance and node adistance offset.
+ */
+ node_adistance = memtype->adistance + adistance_offset;
+ return node_adistance;
+}
+
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
int ret;
--
2.25.1

2023-12-04 08:54:40

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC PATCH 0/2] Node migration between memory tiers

<[email protected]> writes:

> From: Srinivasulu Thanneeru <[email protected]>
>
> The memory tiers feature allows nodes with similar memory types
> or performance characteristics to be grouped together in a
> memory tier. However, there is currently no provision for
> moving a node from one tier to another on demand.
>
> This patch series aims to support node migration between tiers
> on demand by sysadmin/root user using the provided sysfs for
> node migration. Each tier has a start abstract distance(adistance)
> and range.

We have discussed migrating nodes (in fact nodes of a memory type)
between tiers by sysadmin/root before. The basic idea comes from
Johannes. It is summarized on page 11 of [1],

[1] https://lpc.events/event/16/contributions/1209/attachments/1042/1995/Live%20In%20a%20World%20With%20Multiple%20Memory%20Types.pdf

The abstract distance of a memory type (e.g., GPU HBM) can be adjusted
via a sysfs knob (<memory_type>/abstract_distance_offset).

I still think that it is better to use the memory type to change
the abstract distance of nodes. Do you agree?

--
Best Regards,
Huang, Ying

> To migrate a node to a tier, the corresponding node’s sysfs
> adistance_offset is written with a value corresponding to
> the tier’s adistance.
>
> Example: Move node2 to memory tier5 from its default tier(i.e 4)
>
> 1. Check default values:
> $cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
> 0-2
>
> $cat /sys/devices/system/node/node0/adistance_offset
> 0
> $cat /sys/devices/system/node/node1/adistance_offset
> 0
> $cat /sys/devices/system/node/node2/adistance_offset
> 0
>
> 2. Move node2 to tier5:
>
> To move node2 from memory_tier4 (adistance=512) to
> memory_tier5 (adistance=640), set the `adistance_offset` of
> node 2 to 128 (i.e., 512 + 128 = 640).
>
> Tier4 adistance start can be derived from the tier id
> (i.e., for tier4, 4 << 7 = 512).
>
> $echo 128 > /sys/devices/system/node/node2/adistance_offset
> $cat /sys/devices/system/node/node2/adistance_offset
> 128
>
> 3. Verify node2's tier id:
>
> $cat /sys/devices/virtual/memory_tiering/memory_tier5/nodelist
> 2
> $cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
> 0-1
>
> Srinivasulu Thanneeru (2):
> base/node: Add sysfs for adistance_offset
> memory tier: Support node migration between tiers
>
> drivers/base/node.c | 51 +++++++++++++++++++++++
> include/linux/memory-tiers.h | 11 +++++
> include/linux/node.h | 6 +++
> mm/memory-tiers.c | 79 ++++++++++++++++++++----------------
> 4 files changed, 113 insertions(+), 34 deletions(-)