2011-04-27 07:57:58

by Kamezawa Hiroyuki

Subject: [PATCHv2] memcg: reclaim memory from node in round-robin

I changed the logic a little and added a filter for skipping nodes.
On a large NUMA system, tasks may run under cpuset or mempolicy and memory
usage can be unbalanced across nodes. So, I think a filter is required.

==
Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit
limits under memcg, reclaim frees memory from the threads themselves,
and that may be their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains
and the usages are
Node 0: 1M
Node 1: 998M.

If an application then runs on Node 0, it will eat into its own working
set before freeing the unnecessary file caches.

This patch adds round-robin over NUMA nodes and applies equal pressure
to each node. When using cpuset's memory spread feature, this will work
very well.


From: Ying Han <[email protected]>
Signed-off-by: Ying Han <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

Changelog v1->v2:
- fixed comments.
- added logic to avoid scanning unused nodes.

---
include/linux/memcontrol.h | 1
mm/memcontrol.c | 98 ++++++++++++++++++++++++++++++++++++++++++---
mm/vmscan.c | 9 +++-
3 files changed, 101 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
*/
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
* reclaimed from.
*/
int last_scanned_child;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ unsigned long next_scan_node_update;
+#endif
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
}

+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+ struct mem_cgroup_per_zone *mz;
+ u64 total;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ return total;
+}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
- int nid, zid;
- struct mem_cgroup_per_zone *mz;
+ int nid;
u64 total = 0;

for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
+ total += mem_cgroup_get_zonestat_node(mem, nid, idx);
return total;
}

@@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
return ret;
}

+#if MAX_NUMNODES > 1
+
+/*
+ * Update nodemask always is not very good. Even if we have empty
+ * list, or wrong list here, we can start from some node and traverse all nodes
+ * based on zonelist. So, update the list loosely once in 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+ int nid;
+
+ if (time_after(mem->next_scan_node_update, jiffies))
+ return;
+
+ mem->next_scan_node_update = jiffies + 10*HZ;
+ /* make a nodemask where this memcg uses memory from */
+ mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+ continue;
+
+ if (total_swap_pages &&
+ (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+ continue;
+ node_clear(nid, mem->scan_nodes);
+ }
+
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, start from anywhere is O,K. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(mem);
+ node = mem->last_scanned_node;
+
+ node = next_node(node, mem->scan_nodes);
+ if (node == MAX_NUMNODES) {
+ node = first_node(mem->scan_nodes);
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+ }
+
+ mem->last_scanned_node = node;
+ return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ return 0;
+}
+#endif
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4763,7 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
+ mem->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&mem->oom_notify);

if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
};
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim
+ * don't take care of from where we get pages . So, the node where
+ * we start scan is not needed to be current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ zonelist = NODE_DATA(nid)->node_zonelists;

trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,


2011-04-27 17:33:49

by Ying Han

Subject: Re: [PATCHv2] memcg: reclaim memory from node in round-robin

On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> I changed the logic a little and add a filter for skipping nodes.
> With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> can be unbalanced. So, I think a filter is required.

Thank you.

>
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
> ? Node 0: ?1M
> ? Node 1: ?998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
>
>
> From: Ying Han <[email protected]>
> Signed-off-by: Ying Han <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> Changelog v1->v2:
> ?- fixed comments.
> ?- added a logic to avoid scanning unused node.
>
> ---
> ?include/linux/memcontrol.h | ? ?1
> ?mm/memcontrol.c ? ? ? ? ? ?| ? 98 ++++++++++++++++++++++++++++++++++++++++++---
> ?mm/vmscan.c ? ? ? ? ? ? ? ?| ? ?9 +++-
> ?3 files changed, 101 insertions(+), 7 deletions(-)
>
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
> ?*/
> ?int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> ?int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> ?unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct zone *zone,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? enum lru_list lru);
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -237,6 +237,11 @@ struct mem_cgroup {
> ? ? ? ? * reclaimed from.
> ? ? ? ? */
> ? ? ? ?int last_scanned_child;
> + ? ? ? int last_scanned_node;
> +#if MAX_NUMNODES > 1
> + ? ? ? nodemask_t ? ? ?scan_nodes;
> + ? ? ? unsigned long ? next_scan_node_update;
> +#endif
> ? ? ? ?/*
> ? ? ? ? * Should the accounting and control be hierarchical, per subtree?
> ? ? ? ? */
> @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
> ? ? ? ?this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
> ?}
>
> +static unsigned long
> +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> +{
> + ? ? ? struct mem_cgroup_per_zone *mz;
> + ? ? ? u64 total;
> + ? ? ? int zid;
> +
> + ? ? ? for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> + ? ? ? ? ? ? ? mz = mem_cgroup_zoneinfo(mem, nid, zid);
> + ? ? ? ? ? ? ? total += MEM_CGROUP_ZSTAT(mz, idx);
> + ? ? ? }
> + ? ? ? return total;
> +}
> ?static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum lru_list idx)
> ?{
> - ? ? ? int nid, zid;
> - ? ? ? struct mem_cgroup_per_zone *mz;
> + ? ? ? int nid;
> ? ? ? ?u64 total = 0;
>
> ? ? ? ?for_each_online_node(nid)
> - ? ? ? ? ? ? ? for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> - ? ? ? ? ? ? ? ? ? ? ? mz = mem_cgroup_zoneinfo(mem, nid, zid);
> - ? ? ? ? ? ? ? ? ? ? ? total += MEM_CGROUP_ZSTAT(mz, idx);
> - ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? total += mem_cgroup_get_zonestat_node(mem, nid, idx);
> ? ? ? ?return total;
> ?}
>
> @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
> ? ? ? ?return ret;
> ?}
>
> +#if MAX_NUMNODES > 1
> +
> +/*
> + * Update nodemask always is not very good. Even if we have empty
> + * list, or wrong list here, we can start from some node and traverse all nodes
> + * based on zonelist. So, update the list loosely once in 10 secs.
> + *
> + */
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +{
> + ? ? ? int nid;
> +
> + ? ? ? if (time_after(mem->next_scan_node_update, jiffies))
> + ? ? ? ? ? ? ? return;
> +
> + ? ? ? mem->next_scan_node_update = jiffies + 10*HZ;
> + ? ? ? /* make a nodemask where this memcg uses memory from */
> + ? ? ? mem->scan_nodes = node_states[N_HIGH_MEMORY];
> +
> + ? ? ? for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> +
> + ? ? ? ? ? ? ? if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> + ? ? ? ? ? ? ? ? ? mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> +
> + ? ? ? ? ? ? ? if (total_swap_pages &&
> + ? ? ? ? ? ? ? ? ? (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> + ? ? ? ? ? ? ? ? ? ?mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? node_clear(nid, mem->scan_nodes);
> + ? ? ? }
> +
> +}
> +
> +/*
> + * Selecting a node where we start reclaim from. Because what we need is just
> + * reducing usage counter, start from anywhere is O,K. Considering
> + * memory reclaim from current node, there are pros. and cons.
> + *
> + * Freeing memory from current node means freeing memory from a node which
> + * we'll use or we've used. So, it may make LRU bad. And if several threads
> + * hit limits, it will see a contention on a node. But freeing from remote
> + * node means more costs for memory reclaim because of memory latency.
> + *
> + * Now, we use round-robin. Better algorithm is welcomed.
> + */
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> + ? ? ? int node;
> +
> + ? ? ? mem_cgroup_may_update_nodemask(mem);
> + ? ? ? node = mem->last_scanned_node;
> +
> + ? ? ? node = next_node(node, mem->scan_nodes);
> + ? ? ? if (node == MAX_NUMNODES) {
> + ? ? ? ? ? ? ? node = first_node(mem->scan_nodes);
> + ? ? ? ? ? ? ? if (unlikely(node == MAX_NUMNODES))
> + ? ? ? ? ? ? ? ? ? ? ? node = numa_node_id();
I'm not sure about this logic: is it possible that we reclaim from a node
whose pages are all "unreclaimable" (based on the
mem_cgroup_may_update_nodemask check)?
If I missed anything here, it would be helpful to add a comment.

--Ying

> + ? ? ? }
> +
> + ? ? ? mem->last_scanned_node = node;
> + ? ? ? return node;
> +}
> +
> +#else
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> + ? ? ? return 0;
> +}
> +#endif
> +
> ?/*
> ?* Scan the hierarchy if needed to reclaim memory. We remember the last child
> ?* we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -4678,6 +4763,7 @@ mem_cgroup_create(struct cgroup_subsys *
> ? ? ? ? ? ? ? ?res_counter_init(&mem->memsw, NULL);
> ? ? ? ?}
> ? ? ? ?mem->last_scanned_child = 0;
> + ? ? ? mem->last_scanned_node = MAX_NUMNODES;
> ? ? ? ?INIT_LIST_HEAD(&mem->oom_notify);
>
> ? ? ? ?if (parent)
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
> ?{
> ? ? ? ?struct zonelist *zonelist;
> ? ? ? ?unsigned long nr_reclaimed;
> + ? ? ? int nid;
> ? ? ? ?struct scan_control sc = {
> ? ? ? ? ? ? ? ?.may_writepage = !laptop_mode,
> ? ? ? ? ? ? ? ?.may_unmap = 1,
> @@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
> ? ? ? ? ? ? ? ?.mem_cgroup = mem_cont,
> ? ? ? ? ? ? ? ?.nodemask = NULL, /* we don't care the placement */
> ? ? ? ?};
> + ? ? ? /*
> + ? ? ? ?* Unlike direct reclaim via alloc_pages(), memcg's reclaim
> + ? ? ? ?* don't take care of from where we get pages . So, the node where
> + ? ? ? ?* we start scan is not needed to be current node.
> + ? ? ? ?*/
> + ? ? ? nid = mem_cgroup_select_victim_node(mem_cont);
>
> ? ? ? ?sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
> ? ? ? ? ? ? ? ? ? ? ? ?(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> - ? ? ? zonelist = NODE_DATA(numa_node_id())->node_zonelists;
> + ? ? ? zonelist = NODE_DATA(nid)->node_zonelists;
>
> ? ? ? ?trace_mm_vmscan_memcg_reclaim_begin(0,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?sc.may_writepage,
>
>

2011-04-28 00:05:14

by Kamezawa Hiroyuki

Subject: Re: [PATCHv2] memcg: reclaim memory from node in round-robin

On Wed, 27 Apr 2011 10:33:43 -0700
Ying Han <[email protected]> wrote:

> On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> > I changed the logic a little and add a filter for skipping nodes.
> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> > can be unbalanced. So, I think a filter is required.
>
> Thank you.
>
> >
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> >
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> >   Node 0:  1M
> >   Node 1:  998M.
> >
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> >
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
> >
> >
> > From: Ying Han <[email protected]>
> > Signed-off-by: Ying Han <[email protected]>
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> >
> > Changelog v1->v2:
> >  - fixed comments.
> >  - added a logic to avoid scanning unused node.
> >
> > ---
> >  include/linux/memcontrol.h |    1
> >  mm/memcontrol.c            |   98 ++++++++++++++++++++++++++++++++++++++++++---
> >  mm/vmscan.c                |    9 +++-
> >  3 files changed, 101 insertions(+), 7 deletions(-)
> >
> > Index: memcg/include/linux/memcontrol.h
> > ===================================================================
> > --- memcg.orig/include/linux/memcontrol.h
> > +++ memcg/include/linux/memcontrol.h
> > @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
> >  */
> >  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> >  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> > +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> >  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> >                                       struct zone *zone,
> >                                       enum lru_list lru);
> > Index: memcg/mm/memcontrol.c
> > ===================================================================
> > --- memcg.orig/mm/memcontrol.c
> > +++ memcg/mm/memcontrol.c
> > @@ -237,6 +237,11 @@ struct mem_cgroup {
> >         * reclaimed from.
> >         */
> >        int last_scanned_child;
> > +       int last_scanned_node;
> > +#if MAX_NUMNODES > 1
> > +       nodemask_t      scan_nodes;
> > +       unsigned long   next_scan_node_update;
> > +#endif
> >        /*
> >         * Should the accounting and control be hierarchical, per subtree?
> >         */
> > @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
> >        this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
> >  }
> >
> > +static unsigned long
> > +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> > +{
> > +       struct mem_cgroup_per_zone *mz;
> > +       u64 total;
> > +       int zid;
> > +
> > +       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> > +               mz = mem_cgroup_zoneinfo(mem, nid, zid);
> > +               total += MEM_CGROUP_ZSTAT(mz, idx);
> > +       }
> > +       return total;
> > +}
> >  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> >                                        enum lru_list idx)
> >  {
> > -       int nid, zid;
> > -       struct mem_cgroup_per_zone *mz;
> > +       int nid;
> >        u64 total = 0;
> >
> >        for_each_online_node(nid)
> > -               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> > -                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
> > -                       total += MEM_CGROUP_ZSTAT(mz, idx);
> > -               }
> > +               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
> >        return total;
> >  }
> >
> > @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
> >        return ret;
> >  }
> >
> > +#if MAX_NUMNODES > 1
> > +
> > +/*
> > + * Update nodemask always is not very good. Even if we have empty
> > + * list, or wrong list here, we can start from some node and traverse all nodes
> > + * based on zonelist. So, update the list loosely once in 10 secs.
> > + *
> > + */
> > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> > +{
> > +       int nid;
> > +
> > +       if (time_after(mem->next_scan_node_update, jiffies))
> > +               return;
> > +
> > +       mem->next_scan_node_update = jiffies + 10*HZ;
> > +       /* make a nodemask where this memcg uses memory from */
> > +       mem->scan_nodes = node_states[N_HIGH_MEMORY];
> > +
> > +       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> > +
> > +               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> > +                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> > +                       continue;
> > +
> > +               if (total_swap_pages &&
> > +                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> > +                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> > +                       continue;
> > +               node_clear(nid, mem->scan_nodes);
> > +       }
> > +
> > +}
> > +
> > +/*
> > + * Selecting a node where we start reclaim from. Because what we need is just
> > + * reducing usage counter, start from anywhere is O,K. Considering
> > + * memory reclaim from current node, there are pros. and cons.
> > + *
> > + * Freeing memory from current node means freeing memory from a node which
> > + * we'll use or we've used. So, it may make LRU bad. And if several threads
> > + * hit limits, it will see a contention on a node. But freeing from remote
> > + * node means more costs for memory reclaim because of memory latency.
> > + *
> > + * Now, we use round-robin. Better algorithm is welcomed.
> > + */
> > +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> > +{
> > +       int node;
> > +
> > +       mem_cgroup_may_update_nodemask(mem);
> > +       node = mem->last_scanned_node;
> > +
> > +       node = next_node(node, mem->scan_nodes);
> > +       if (node == MAX_NUMNODES) {
> > +               node = first_node(mem->scan_nodes);
> > +               if (unlikely(node == MAX_NUMNODES))
> > +                       node = numa_node_id();
> not sure about this logic, is that possible we reclaim from a node
> with all "unreclaimable" pages (based on the
> mem_cgroup_may_update_nodemask check).
> If i missed anything here, it would be helpful to add comment.
>

What I'm afraid of here is that when a user runs a very small memcg,
all pages on the LRU may be isolated, all usage may sit in memcg's
per-cpu caches, or, because of task migration between memcgs, the limit
may be hit before any pages are on the LRU... I think there are possible
corner cases which can cause a hang.

OK, will add a comment.

Thanks,
-Kame
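
For readers unfamiliar with the nodemask helpers, here is a minimal
userspace-style sketch of the round-robin walk and the empty-mask fallback
discussed above. It is illustrative only and is not kernel code; NODE_COUNT
stands in for MAX_NUMNODES and the bitmask plays the role of scan_nodes.

/*
 * Illustrative sketch (plain C, not kernel code): round-robin
 * selection over a small bitmask, mirroring the logic above.
 */
#include <stdio.h>

#define NODE_COUNT 4

/* Return the next set bit after 'prev', or NODE_COUNT if none is left. */
static int next_set(int prev, unsigned int mask)
{
	int n;

	for (n = prev + 1; n < NODE_COUNT; n++)
		if (mask & (1u << n))
			return n;
	return NODE_COUNT;
}

static int select_victim(int *last, unsigned int scan_mask, int current_node)
{
	int node = next_set(*last, scan_mask);

	if (node == NODE_COUNT)			/* reached the end: wrap around */
		node = next_set(-1, scan_mask);
	if (node == NODE_COUNT)			/* mask is empty: fall back */
		node = current_node;
	*last = node;
	return node;
}

int main(void)
{
	int last = NODE_COUNT;		/* like last_scanned_node's initial value */
	unsigned int mask = 0x5;	/* only nodes 0 and 2 have reclaimable pages */
	int i;

	for (i = 0; i < 5; i++)
		printf("victim: %d\n", select_victim(&last, mask, 0));
	return 0;
}

With a non-empty mask this prints 0, 2, 0, 2, 0; with mask = 0 it keeps
returning the caller's current node, which is the corner case the fallback
above is meant to cover.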





2011-04-28 00:41:52

by Kamezawa Hiroyuki

Subject: [PATCHv3] memcg: reclaim memory from node in round-robin

Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit
limits under memcg, reclaim frees memory from the threads themselves,
and that may be their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains
and the usages are
Node 0: 1M
Node 1: 998M.

If an application then runs on Node 0, it will eat into its own working
set before freeing the unnecessary file caches.

This patch adds round-robin over NUMA nodes and applies equal pressure
to each node. When using cpuset's memory spread feature, this will work
very well.

But yes, a better algorithm is appreciated.

From: Ying Han <[email protected]>
Signed-off-by: Ying Han <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

Changelog v2->v3
- added comments for why we need sanity check.

Changelog v1->v2:
- fixed comments.
- added logic to avoid scanning unused nodes.

---
include/linux/memcontrol.h | 1
mm/memcontrol.c | 102 ++++++++++++++++++++++++++++++++++++++++++---
mm/vmscan.c | 9 +++
3 files changed, 105 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
*/
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
* reclaimed from.
*/
int last_scanned_child;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ unsigned long next_scan_node_update;
+#endif
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
}

+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+ struct mem_cgroup_per_zone *mz;
+ u64 total;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ return total;
+}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
- int nid, zid;
- struct mem_cgroup_per_zone *mz;
+ int nid;
u64 total = 0;

for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
+ total += mem_cgroup_get_zonestat_node(mem, nid, idx);
return total;
}

@@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
return ret;
}

+#if MAX_NUMNODES > 1
+
+/*
+ * Update nodemask always is not very good. Even if we have empty
+ * list, or wrong list here, we can start from some node and traverse all nodes
+ * based on zonelist. So, update the list loosely once in 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+ int nid;
+
+ if (time_after(mem->next_scan_node_update, jiffies))
+ return;
+
+ mem->next_scan_node_update = jiffies + 10*HZ;
+ /* make a nodemask where this memcg uses memory from */
+ mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+ continue;
+
+ if (total_swap_pages &&
+ (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+ continue;
+ node_clear(nid, mem->scan_nodes);
+ }
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, start from anywhere is O,K. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(mem);
+ node = mem->last_scanned_node;
+
+ node = next_node(node, mem->scan_nodes);
+ if (node == MAX_NUMNODES)
+ node = first_node(mem->scan_nodes);
+ /*
+ * We call this when we hit limit, not when pages are added to LRU.
+ * No LRU may hold pages because all pages are UNEVICTABLE or
+ * memcg is too small and all pages are not on LRU. In that case,
+ * we use curret node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
+ mem->last_scanned_node = node;
+ return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ return 0;
+}
+#endif
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
+ mem->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&mem->oom_notify);

if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
};
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim
+ * don't take care of from where we get pages . So, the node where
+ * we start scan is not needed to be current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ zonelist = NODE_DATA(nid)->node_zonelists;

trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,

2011-04-28 01:49:54

by Daisuke Nishimura

Subject: Re: [PATCHv3] memcg: reclaim memory from node in round-robin

On Thu, 28 Apr 2011 09:35:13 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
^^^^^
cache
> and usages are
> Node 0: 1M
> Node 1: 998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. With using cpuset's spread memory feature, this will work very well.
>
> But yes, better algorithm is appreciated.
>
> From: Ying Han <[email protected]>
> Signed-off-by: Ying Han <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> Changelog v2->v3
> - added comments for why we need sanity check.
>
> Changelog v1->v2:
> - fixed comments.
> - added a logic to avoid scanning unused node.
>
> ---
> include/linux/memcontrol.h | 1
> mm/memcontrol.c | 102 ++++++++++++++++++++++++++++++++++++++++++---
> mm/vmscan.c | 9 +++
> 3 files changed, 105 insertions(+), 7 deletions(-)
>
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
> */
> int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> struct zone *zone,
> enum lru_list lru);
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -237,6 +237,11 @@ struct mem_cgroup {
> * reclaimed from.
> */
> int last_scanned_child;
> + int last_scanned_node;
> +#if MAX_NUMNODES > 1
> + nodemask_t scan_nodes;
> + unsigned long next_scan_node_update;
> +#endif
> /*
> * Should the accounting and control be hierarchical, per subtree?
> */
> @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
> this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
> }
>
> +static unsigned long
> +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> +{
> + struct mem_cgroup_per_zone *mz;
> + u64 total;
> + int zid;
> +
> + for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
> + total += MEM_CGROUP_ZSTAT(mz, idx);
> + }
> + return total;
> +}
> static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> enum lru_list idx)
> {
> - int nid, zid;
> - struct mem_cgroup_per_zone *mz;
> + int nid;
> u64 total = 0;
>
> for_each_online_node(nid)
> - for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> - mz = mem_cgroup_zoneinfo(mem, nid, zid);
> - total += MEM_CGROUP_ZSTAT(mz, idx);
> - }
> + total += mem_cgroup_get_zonestat_node(mem, nid, idx);
> return total;
> }
>
> @@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
> return ret;
> }
>
> +#if MAX_NUMNODES > 1
> +
> +/*
> + * Update nodemask always is not very good. Even if we have empty
> + * list, or wrong list here, we can start from some node and traverse all nodes
> + * based on zonelist. So, update the list loosely once in 10 secs.
> + *
> + */
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +{
> + int nid;
> +
> + if (time_after(mem->next_scan_node_update, jiffies))
> + return;
> +
Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?

Looks good to me, otherwise.

Thanks,
Daisuke Nishimura.
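
For reference, the jiffies comparison helpers being discussed are defined in
include/linux/jiffies.h; simplified (the kernel versions also typecheck their
arguments), their semantics are:

/*
 * time_after(a, b)  is true when a is later than b,
 * time_before(a, b) is true when a is earlier than b.
 * The signed subtraction keeps the comparison correct across jiffies
 * wraparound; for example, time_after(jiffies, t) becomes true once
 * the time t has passed.
 */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after((b), (a))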

> + mem->next_scan_node_update = jiffies + 10*HZ;
> + /* make a nodemask where this memcg uses memory from */
> + mem->scan_nodes = node_states[N_HIGH_MEMORY];
> +
> + for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> +
> + if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> + continue;
> +
> + if (total_swap_pages &&
> + (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> + continue;
> + node_clear(nid, mem->scan_nodes);
> + }
> +}
> +
> +/*
> + * Selecting a node where we start reclaim from. Because what we need is just
> + * reducing usage counter, start from anywhere is O,K. Considering
> + * memory reclaim from current node, there are pros. and cons.
> + *
> + * Freeing memory from current node means freeing memory from a node which
> + * we'll use or we've used. So, it may make LRU bad. And if several threads
> + * hit limits, it will see a contention on a node. But freeing from remote
> + * node means more costs for memory reclaim because of memory latency.
> + *
> + * Now, we use round-robin. Better algorithm is welcomed.
> + */
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> + int node;
> +
> + mem_cgroup_may_update_nodemask(mem);
> + node = mem->last_scanned_node;
> +
> + node = next_node(node, mem->scan_nodes);
> + if (node == MAX_NUMNODES)
> + node = first_node(mem->scan_nodes);
> + /*
> + * We call this when we hit limit, not when pages are added to LRU.
> + * No LRU may hold pages because all pages are UNEVICTABLE or
> + * memcg is too small and all pages are not on LRU. In that case,
> + * we use curret node.
> + */
> + if (unlikely(node == MAX_NUMNODES))
> + node = numa_node_id();
> +
> + mem->last_scanned_node = node;
> + return node;
> +}
> +
> +#else
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +{
> + return 0;
> +}
> +#endif
> +
> /*
> * Scan the hierarchy if needed to reclaim memory. We remember the last child
> * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
> res_counter_init(&mem->memsw, NULL);
> }
> mem->last_scanned_child = 0;
> + mem->last_scanned_node = MAX_NUMNODES;
> INIT_LIST_HEAD(&mem->oom_notify);
>
> if (parent)
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
> {
> struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> + int nid;
> struct scan_control sc = {
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> @@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
> .mem_cgroup = mem_cont,
> .nodemask = NULL, /* we don't care the placement */
> };
> + /*
> + * Unlike direct reclaim via alloc_pages(), memcg's reclaim
> + * don't take care of from where we get pages . So, the node where
> + * we start scan is not needed to be current node.
> + */
> + nid = mem_cgroup_select_victim_node(mem_cont);
>
> sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
> (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> - zonelist = NODE_DATA(numa_node_id())->node_zonelists;
> + zonelist = NODE_DATA(nid)->node_zonelists;
>
> trace_mm_vmscan_memcg_reclaim_begin(0,
> sc.may_writepage,
>

2011-04-28 01:55:58

by Kamezawa Hiroyuki

Subject: [PATCHv4] memcg: reclaim memory from node in round-robin

On Thu, 28 Apr 2011 10:37:05 +0900
Daisuke Nishimura <[email protected]> wrote:
> > + if (time_after(mem->next_scan_node_update, jiffies))
> > + return;
> > +
> Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
>
> Looks good to me, otherwise.
>

time_after(a, b) returns true when a is after b.....you're right.
==
Now, memory cgroup's direct reclaim frees memory from the current node.
But this has some troubles. Usually, when a set of threads works in a
cooperative way, they tend to run on the same node. So, if they hit
limits under memcg, reclaim frees memory from the threads themselves,
and that may be their active working set.

For example, assume a 2-node system which has Node 0 and Node 1
and a memcg which has a 1G limit. After some work, file cache remains
and the usages are
Node 0: 1M
Node 1: 998M.

If an application then runs on Node 0, it will eat into its own working
set before freeing the unnecessary file caches.

This patch adds round-robin over NUMA nodes and applies equal pressure
to each node. When using cpuset's memory spread feature, this will work
very well.

But yes, a better algorithm is appreciated.

From: Ying Han <[email protected]>
Signed-off-by: Ying Han <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

Changelog v3->v4
- fixed time_after() bug. using time_before() instead.
Changelog v2->v3
- added comments.

Changelog v1->v2:
- fixed comments.
- added logic to avoid scanning unused nodes.

---
include/linux/memcontrol.h | 1
mm/memcontrol.c | 102 ++++++++++++++++++++++++++++++++++++++++++---
mm/vmscan.c | 9 +++
3 files changed, 105 insertions(+), 7 deletions(-)

Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
*/
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -237,6 +237,11 @@ struct mem_cgroup {
* reclaimed from.
*/
int last_scanned_child;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ unsigned long next_scan_node_update;
+#endif
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
}

+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+ struct mem_cgroup_per_zone *mz;
+ u64 total;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ return total;
+}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
- int nid, zid;
- struct mem_cgroup_per_zone *mz;
+ int nid;
u64 total = 0;

for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
+ total += mem_cgroup_get_zonestat_node(mem, nid, idx);
return total;
}

@@ -1471,6 +1485,81 @@ mem_cgroup_select_victim(struct mem_cgro
return ret;
}

+#if MAX_NUMNODES > 1
+
+/*
+ * Update nodemask always is not very good. Even if we have empty
+ * list, or wrong list here, we can start from some node and traverse all nodes
+ * based on zonelist. So, update the list loosely once in 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+ int nid;
+
+ if (time_before(mem->next_scan_node_update, jiffies))
+ return;
+
+ mem->next_scan_node_update = jiffies + 10*HZ;
+ /* make a nodemask where this memcg uses memory from */
+ mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+ continue;
+
+ if (total_swap_pages &&
+ (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+ mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+ continue;
+ node_clear(nid, mem->scan_nodes);
+ }
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, start from anywhere is O,K. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(mem);
+ node = mem->last_scanned_node;
+
+ node = next_node(node, mem->scan_nodes);
+ if (node == MAX_NUMNODES)
+ node = first_node(mem->scan_nodes);
+ /*
+ * We call this when we hit limit, not when pages are added to LRU.
+ * No LRU may hold pages because all pages are UNEVICTABLE or
+ * memcg is too small and all pages are not on LRU. In that case,
+ * we use curret node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
+ mem->last_scanned_node = node;
+ return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ return 0;
+}
+#endif
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4678,6 +4767,7 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
+ mem->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&mem->oom_notify);

if (parent)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2198,6 +2198,7 @@ unsigned long try_to_free_mem_cgroup_pag
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2208,10 +2209,16 @@ unsigned long try_to_free_mem_cgroup_pag
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
};
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim
+ * don't take care of from where we get pages . So, the node where
+ * we start scan is not needed to be current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ zonelist = NODE_DATA(nid)->node_zonelists;

trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,

2011-04-28 02:09:28

by Daisuke Nishimura

Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Thu, 28 Apr 2011 10:49:12 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> On Thu, 28 Apr 2011 10:37:05 +0900
> Daisuke Nishimura <[email protected]> wrote:
> > > + if (time_after(mem->next_scan_node_update, jiffies))
> > > + return;
> > > +
> > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> >
> > Looks good to me, otherwise.
> >
>
> time_after(a, b) returns true when a is after b.....you're right.
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
> Node 0: 1M
> Node 1: 998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
>
> But yes, better algorithm is appreciated.
>
> From: Ying Han <[email protected]>
> Signed-off-by: Ying Han <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
Acked-by: Daisuke Nishimura <[email protected]>

2011-04-28 02:49:53

by Ying Han

Subject: Re: [PATCHv2] memcg: reclaim memory from node in round-robin

On Wed, Apr 27, 2011 at 4:57 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> On Wed, 27 Apr 2011 10:33:43 -0700
> Ying Han <[email protected]> wrote:
>
>> On Wed, Apr 27, 2011 at 12:51 AM, KAMEZAWA Hiroyuki
>> <[email protected]> wrote:
>> > I changed the logic a little and add a filter for skipping nodes.
>> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
>> > can be unbalanced. So, I think a filter is required.
>>
>> Thank you.
>>
>> >
>> > ==
>> > Now, memory cgroup's direct reclaim frees memory from the current node.
>> > But this has some troubles. In usual, when a set of threads works in
>> > cooperative way, they are tend to on the same node. So, if they hit
>> > limits under memcg, it will reclaim memory from themselves, it may be
>> > active working set.
>> >
>> > For example, assume 2 node system which has Node 0 and Node 1
>> > and a memcg which has 1G limit. After some work, file cacne remains and
>> > and usages are
>> > ? Node 0: ?1M
>> > ? Node 1: ?998M.
>> >
>> > and run an application on Node 0, it will eats its foot before freeing
>> > unnecessary file caches.
>> >
>> > This patch adds round-robin for NUMA and adds equal pressure to each
>> > node. When using cpuset's spread memory feature, this will work very well.
>> >
>> >
>> > From: Ying Han <[email protected]>
>> > Signed-off-by: Ying Han <[email protected]>
>> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>> >
>> > Changelog v1->v2:
>> > ?- fixed comments.
>> > ?- added a logic to avoid scanning unused node.
>> >
>> > ---
>> > ?include/linux/memcontrol.h | ? ?1
>> > ?mm/memcontrol.c ? ? ? ? ? ?| ? 98 ++++++++++++++++++++++++++++++++++++++++++---
>> > ?mm/vmscan.c ? ? ? ? ? ? ? ?| ? ?9 +++-
>> > ?3 files changed, 101 insertions(+), 7 deletions(-)
>> >
>> > Index: memcg/include/linux/memcontrol.h
>> > ===================================================================
>> > --- memcg.orig/include/linux/memcontrol.h
>> > +++ memcg/include/linux/memcontrol.h
>> > @@ -108,6 +108,7 @@ extern void mem_cgroup_end_migration(str
>> > ?*/
>> > ?int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>> > ?int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
>> > +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>> > ?unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
>> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct zone *zone,
>> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? enum lru_list lru);
>> > Index: memcg/mm/memcontrol.c
>> > ===================================================================
>> > --- memcg.orig/mm/memcontrol.c
>> > +++ memcg/mm/memcontrol.c
>> > @@ -237,6 +237,11 @@ struct mem_cgroup {
>> > ? ? ? ? * reclaimed from.
>> > ? ? ? ? */
>> > ? ? ? ?int last_scanned_child;
>> > + ? ? ? int last_scanned_node;
>> > +#if MAX_NUMNODES > 1
>> > + ? ? ? nodemask_t ? ? ?scan_nodes;
>> > + ? ? ? unsigned long ? next_scan_node_update;
>> > +#endif
>> > ? ? ? ?/*
>> > ? ? ? ? * Should the accounting and control be hierarchical, per subtree?
>> > ? ? ? ? */
>> > @@ -650,18 +655,27 @@ static void mem_cgroup_soft_scan(struct
>> > ? ? ? ?this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
>> > ?}
>> >
>> > +static unsigned long
>> > +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
>> > +{
>> > + ? ? ? struct mem_cgroup_per_zone *mz;
>> > + ? ? ? u64 total;
>> > + ? ? ? int zid;
>> > +
>> > + ? ? ? for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> > + ? ? ? ? ? ? ? mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> > + ? ? ? ? ? ? ? total += MEM_CGROUP_ZSTAT(mz, idx);
>> > + ? ? ? }
>> > + ? ? ? return total;
>> > +}
>> > ?static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
>> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?enum lru_list idx)
>> > ?{
>> > - ? ? ? int nid, zid;
>> > - ? ? ? struct mem_cgroup_per_zone *mz;
>> > + ? ? ? int nid;
>> > ? ? ? ?u64 total = 0;
>> >
>> > ? ? ? ?for_each_online_node(nid)
>> > - ? ? ? ? ? ? ? for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> > - ? ? ? ? ? ? ? ? ? ? ? mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> > - ? ? ? ? ? ? ? ? ? ? ? total += MEM_CGROUP_ZSTAT(mz, idx);
>> > - ? ? ? ? ? ? ? }
>> > + ? ? ? ? ? ? ? total += mem_cgroup_get_zonestat_node(mem, nid, idx);
>> > ? ? ? ?return total;
>> > ?}
>> >
>> > @@ -1471,6 +1485,77 @@ mem_cgroup_select_victim(struct mem_cgro
>> > ? ? ? ?return ret;
>> > ?}
>> >
>> > +#if MAX_NUMNODES > 1
>> > +
>> > +/*
>> > + * Update nodemask always is not very good. Even if we have empty
>> > + * list, or wrong list here, we can start from some node and traverse all nodes
>> > + * based on zonelist. So, update the list loosely once in 10 secs.
>> > + *
>> > + */
>> > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
>> > +{
>> > + ? ? ? int nid;
>> > +
>> > + ? ? ? if (time_after(mem->next_scan_node_update, jiffies))
>> > + ? ? ? ? ? ? ? return;
>> > +
>> > + ? ? ? mem->next_scan_node_update = jiffies + 10*HZ;
>> > + ? ? ? /* make a nodemask where this memcg uses memory from */
>> > + ? ? ? mem->scan_nodes = node_states[N_HIGH_MEMORY];
>> > +
>> > + ? ? ? for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
>> > +
>> > + ? ? ? ? ? ? ? if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
>> > + ? ? ? ? ? ? ? ? ? mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
>> > + ? ? ? ? ? ? ? ? ? ? ? continue;
>> > +
>> > + ? ? ? ? ? ? ? if (total_swap_pages &&
>> > + ? ? ? ? ? ? ? ? ? (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
>> > + ? ? ? ? ? ? ? ? ? ?mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
>> > + ? ? ? ? ? ? ? ? ? ? ? continue;
>> > + ? ? ? ? ? ? ? node_clear(nid, mem->scan_nodes);
>> > + ? ? ? }
>> > +
>> > +}
>> > +
>> > +/*
>> > + * Selecting a node where we start reclaim from. Because what we need is just
>> > + * reducing usage counter, start from anywhere is O,K. Considering
>> > + * memory reclaim from current node, there are pros. and cons.
>> > + *
>> > + * Freeing memory from current node means freeing memory from a node which
>> > + * we'll use or we've used. So, it may make LRU bad. And if several threads
>> > + * hit limits, it will see a contention on a node. But freeing from remote
>> > + * node means more costs for memory reclaim because of memory latency.
>> > + *
>> > + * Now, we use round-robin. Better algorithm is welcomed.
>> > + */
>> > +int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
>> > +{
>> > + ? ? ? int node;
>> > +
>> > + ? ? ? mem_cgroup_may_update_nodemask(mem);
>> > + ? ? ? node = mem->last_scanned_node;
>> > +
>> > + ? ? ? node = next_node(node, mem->scan_nodes);
>> > + ? ? ? if (node == MAX_NUMNODES) {
>> > + ? ? ? ? ? ? ? node = first_node(mem->scan_nodes);
>> > + ? ? ? ? ? ? ? if (unlikely(node == MAX_NUMNODES))
>> > + ? ? ? ? ? ? ? ? ? ? ? node = numa_node_id();
>> not sure about this logic, is that possible we reclaim from a node
>> with all "unreclaimable" pages (based on the
>> mem_cgroup_may_update_nodemask check).
>> If i missed anything here, it would be helpful to add comment.
>>
>
> What I'm afraid here is when a user uses very small memcg,
> all pages on the LRU may be isolated or all usages are in per-cpu cache
> of memcg or because of task-migration between memcg, it hits limit before
> having any pages on LRU.....I think there is possible corner cases which
> can cause hang.
>
> ok, will add comment.

Ok, thanks. Otherwise it looks good.

Acked-by: Ying Han <[email protected]>

--Ying

>
> Thanks,
> -Kame
>
>
>
>
>
>
>

2011-05-04 21:32:31

by Andrew Morton

Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Thu, 28 Apr 2011 10:49:12 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> On Thu, 28 Apr 2011 10:37:05 +0900
> Daisuke Nishimura <[email protected]> wrote:
> > > + if (time_after(mem->next_scan_node_update, jiffies))
> > > + return;
> > > +
> > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> >
> > Looks good to me, otherwise.
> >
>
> time_after(a, b) returns true when a is after b.....you're right.
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
> Node 0: 1M
> Node 1: 998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.
>
> But yes, better algorithm is appreciated.

That ten-second thing is a gruesome and ghastly hack, but didn't even
get a mention in the patch description?

Talk to us about it. Why is it there? What are the implications of
getting it wrong? What alternatives are there?

It would be much better to work out the optimum time at which to rotate
the index via some deterministic means.

If we can't think of a way of doing that then we should at least pace
the rotation frequency via something saner than wall-time. Such as
number-of-pages-scanned.
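
As a rough illustration of that alternative, the refresh could be keyed to
reclaim activity rather than wall time. The sketch below is hypothetical
(the names scan_events and NUMAINFO_SCAN_THRESHOLD are made up and appear
in no posted patch); it only shows the shape of a deterministic,
scan-driven trigger:

/*
 * Hypothetical sketch: refresh the per-memcg scan_nodes mask after a
 * fixed amount of reclaim activity instead of every 10 seconds.
 */
#define NUMAINFO_SCAN_THRESHOLD	1024	/* pages scanned between refreshes */

struct numainfo_pacer {
	unsigned long scan_events;	/* pages scanned since the last refresh */
};

/* Called from the reclaim path with the number of pages just scanned;
 * returns nonzero when the caller should rebuild its node mask. */
static int numainfo_refresh_due(struct numainfo_pacer *p,
				unsigned long nr_scanned)
{
	p->scan_events += nr_scanned;
	if (p->scan_events < NUMAINFO_SCAN_THRESHOLD)
		return 0;
	p->scan_events = 0;
	return 1;
}

In the thread's terms, mem_cgroup_may_update_nodemask() would then be
driven by such a counter instead of by jiffies.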

2011-05-06 06:19:43

by Kamezawa Hiroyuki

Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Wed, 4 May 2011 14:26:23 -0700
Andrew Morton <[email protected]> wrote:

> On Thu, 28 Apr 2011 10:49:12 +0900
> KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> > On Thu, 28 Apr 2011 10:37:05 +0900
> > Daisuke Nishimura <[email protected]> wrote:
> > > > + if (time_after(mem->next_scan_node_update, jiffies))
> > > > + return;
> > > > +
> > > Shouldn't it be time_before() or time_after(jiffies, next_scan_node_update) ?
> > >
> > > Looks good to me, otherwise.
> > >
> >
> > time_after(a, b) returns true when a is after b.....you're right.
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> >
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> > Node 0: 1M
> > Node 1: 998M.
> >
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> >
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
> >
> > But yes, better algorithm is appreciated.
>
> That ten-second thing is a gruesome and ghastly hack, but didn't even
> get a mention in the patch description?
>
> Talk to us about it. Why is it there? What are the implications of
> getting it wrong? What alternatives are there?
>

Ah, sorry, I couldn't think of a fix at that level, so I posted it as is.

> It would be much better to work out the optimum time at which to rotate
> the index via some deterministic means.
>
> If we can't think of a way of doing that then we should at least pace
> the rotation frequency via something saner than wall-time. Such as
> number-of-pages-scanned.
>


What I think now is using reclaim_stat or using some fairness based on
the ratio of inactive file caches. We can calculate the total sum of
reclaim_stat, which gives us a scan_ratio for the whole memcg. And we can
calculate the LRU rotate/scan ratio per node. If the rotate/scan ratio is small,
the node will be a good candidate as a reclaim target. Hmm,

- check which memory (anon or file) should be scanned.
  (If file is too small, the rotate/scan ratio of file is meaningless.)
- check the rotate/scan ratio of each node.
- calculate weights for each node (by some logic?)
- give a fair scan w.r.t. each node's weight.

Hmm, I'll have a study on this.

Thanks.
-Kame
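
A rough sketch of the rotate/scan-ratio weighting described in the mail above,
assuming the per-zone reclaim_stat counters (recent_rotated[]/recent_scanned[])
that memcg already keeps; node_reclaim_weight() itself is a hypothetical
helper, not part of any posted patch.

static unsigned long node_reclaim_weight(struct mem_cgroup *mem, int nid)
{
        struct zone_reclaim_stat *rs;
        unsigned long rotated = 0, scanned = 0;
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                rs = &mem_cgroup_zoneinfo(mem, nid, zid)->reclaim_stat;
                rotated += rs->recent_rotated[0] + rs->recent_rotated[1];
                scanned += rs->recent_scanned[0] + rs->recent_scanned[1];
        }
        /*
         * A low rotate/scan ratio means pages on this node are rarely
         * re-activated, so it is a good reclaim candidate: give it a
         * larger weight.  The +1 avoids division by zero.
         */
        return (scanned + 1) * 100 / (rotated + 1);
}

A node whose pages are scanned heavily but rarely rotated back to the active
list gets a larger weight, i.e. becomes a more attractive reclaim target.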












2011-05-09 02:20:35

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [PATCHv2] memcg: reclaim memory from node in round-robin

> I changed the logic a little and add a filter for skipping nodes.
> With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> can be unbalanced. So, I think a filter is required.
>
> ==
> Now, memory cgroup's direct reclaim frees memory from the current node.
> But this has some troubles. In usual, when a set of threads works in
> cooperative way, they are tend to on the same node. So, if they hit
> limits under memcg, it will reclaim memory from themselves, it may be
> active working set.
>
> For example, assume 2 node system which has Node 0 and Node 1
> and a memcg which has 1G limit. After some work, file cacne remains and
> and usages are
> Node 0: 1M
> Node 1: 998M.
>
> and run an application on Node 0, it will eats its foot before freeing
> unnecessary file caches.
>
> This patch adds round-robin for NUMA and adds equal pressure to each
> node. When using cpuset's spread memory feature, this will work very well.

Looks nice. And it would be even nicer if global reclaim had the same feature.
Do you have a plan to do it?

2011-05-09 02:37:13

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCHv2] memcg: reclaim memory from node in round-robin

On Mon, 9 May 2011 11:20:31 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:

> > I changed the logic a little and add a filter for skipping nodes.
> > With large NUMA, tasks may under cpuset or mempolicy and the usage of memory
> > can be unbalanced. So, I think a filter is required.
> >
> > ==
> > Now, memory cgroup's direct reclaim frees memory from the current node.
> > But this has some troubles. In usual, when a set of threads works in
> > cooperative way, they are tend to on the same node. So, if they hit
> > limits under memcg, it will reclaim memory from themselves, it may be
> > active working set.
> >
> > For example, assume 2 node system which has Node 0 and Node 1
> > and a memcg which has 1G limit. After some work, file cacne remains and
> > and usages are
> > Node 0: 1M
> > Node 1: 998M.
> >
> > and run an application on Node 0, it will eats its foot before freeing
> > unnecessary file caches.
> >
> > This patch adds round-robin for NUMA and adds equal pressure to each
> > node. When using cpuset's spread memory feature, this will work very well.
>
> Looks nice. And it would be even nicer if global reclaim had the same feature.
> Do you have a plan to do it?
>

Hmm, IIUC, when allocating memory for file cache, we may be able to avoid starting
from the current node. But isn't that a feature of cpuset?
If cpuset.memory_spread_page==1, a page for a file is allocated from nodes in
round-robin, and memory reclaim runs in the same manner (using a node-only zonelist
fallback).

Do you mean the kernel should have a knob for allowing non-local allocation for
file caches even without cpuset?

Thanks,
-Kame


2011-05-26 19:52:34

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Fri, 6 May 2011 15:13:02 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> > It would be much better to work out the optimum time at which to rotate
> > the index via some deterministic means.
> >
> > If we can't think of a way of doing that then we should at least pace
> > the rotation frequency via something saner than wall-time. Such as
> > number-of-pages-scanned.
> >
>
>
> What I think now is using reclaim_stat or using some fairness based on
> the ratio of inactive file caches. We can calculate the total sum of
> reclaim_stat, which gives us a scan_ratio for the whole memcg. And we can
> calculate the LRU rotate/scan ratio per node. If the rotate/scan ratio is small,
> the node will be a good candidate as a reclaim target. Hmm,
>
> - check which memory (anon or file) should be scanned.
>   (If file is too small, the rotate/scan ratio of file is meaningless.)
> - check the rotate/scan ratio of each node.
> - calculate weights for each node (by some logic?)
> - give a fair scan w.r.t. each node's weight.
>
> Hmm, I'll have a study on this.

How's the study coming along ;)

I'll send this in to Linus today, but I'll feel grumpy while doing so.
We really should do something smarter here - the magic constant will
basically always be suboptimal for everyone and we end up tweaking its
value (if we don't, then the feature just wasn't valuable in the first
place) and then we add a tunable and then people try to tweak the
default setting of the tunable and then I deride them for not setting
the tunable in initscripts and then we have to maintain the stupid
tunable after we've changed the internal implementation and it's all
basically screwed up.

How do we automatically determine the optimum time at which to rotate,
at runtime?

2011-05-27 00:01:31

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Thu, 26 May 2011 12:52:07 -0700
Andrew Morton <[email protected]> wrote:

> On Fri, 6 May 2011 15:13:02 +0900
> KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> > > It would be much better to work out the optimum time at which to rotate
> > > the index via some deterministic means.
> > >
> > > If we can't think of a way of doing that then we should at least pace
> > > the rotation frequency via something saner than wall-time. Such as
> > > number-of-pages-scanned.
> > >
> >
> >
> > What I think now is using reclaim_stat or using some fairness based on
> > the ratio of inactive file caches. We can calculate the total sum of
> > reclaim_stat, which gives us a scan_ratio for the whole memcg. And we can
> > calculate the LRU rotate/scan ratio per node. If the rotate/scan ratio is small,
> > the node will be a good candidate as a reclaim target. Hmm,
> >
> > - check which memory (anon or file) should be scanned.
> >   (If file is too small, the rotate/scan ratio of file is meaningless.)
> > - check the rotate/scan ratio of each node.
> > - calculate weights for each node (by some logic?)
> > - give a fair scan w.r.t. each node's weight.
> >
> > Hmm, I'll have a study on this.
>
> How's the study coming along ;)
>
> I'll send this in to Linus today, but I'll feel grumpy while doing so.
> We really should do something smarter here - the magic constant will
> basically always be suboptimal for everyone and we end up tweaking its
> value (if we don't, then the feature just wasn't valuable in the first
> place) and then we add a tunable and then people try to tweak the
> default setting of the tunable and then I deride them for not setting
> the tunable in initscripts and then we have to maintain the stupid
> tunable after we've changed the internal implementation and it's all
> basically screwed up.
>
> How do we automatically determine the optimum time at which to rotate,
> at runtime?
>

Ah, I think I should check it after dirty page accounting comes...because
the ratio of dirty pages is important information..

Ok, what I think now is just comparing the number of INACTIVE_FILE or the number
of FILE CACHES per node.

I think we can periodically update the per-node and total amount of file caches
and record the per-node
	node_file_cache * 100 / total_file_cache
information in memcg's per-node structure.

Then, I think we can do some scheduling like lottery scheduling, with a scan proportional
to the ratio of file caches in the memcg. If it's better to check INACTIVE_ANON,
I think swappiness can be used in the above calculation.

But yes, I or someone may be able to think of something much better.

Thanks,
-Kame
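
A self-contained userspace illustration of the lottery-scheduling idea above,
picking a node with probability proportional to its weight (for example, the
per-node file-cache percentages mentioned). The numbers are made up; the patch
below does the same thing in-kernel with random32() against normalized
per-node scan_weight values.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* pick a node with probability weights[nid] / sum(weights) */
static int lottery_pick_node(const unsigned long *weights, int nr_nodes)
{
        unsigned long total = 0, ticket;
        int nid;

        for (nid = 0; nid < nr_nodes; nid++)
                total += weights[nid];
        if (!total)
                return 0;       /* nothing evictable: fall back to node 0 */

        ticket = (unsigned long)rand() % total;
        for (nid = 0; nid < nr_nodes; nid++) {
                if (ticket < weights[nid])
                        return nid;
                ticket -= weights[nid];
        }
        return nr_nodes - 1;    /* not reached */
}

int main(void)
{
        /* per-node file-cache weights from the 1M/998M example */
        unsigned long weights[2] = { 1, 998 };
        int hits[2] = { 0, 0 }, i;

        srand((unsigned)time(NULL));
        for (i = 0; i < 100000; i++)
                hits[lottery_pick_node(weights, 2)]++;
        /* expect roughly a 1:998 split of picks */
        printf("node0=%d node1=%d\n", hits[0], hits[1]);
        return 0;
}

With the 1M/998M split from the earlier example, node 1 wins roughly 998 out
of every 999 draws.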




2011-05-27 02:46:04

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCHv4] memcg: reclaim memory from node in round-robin

On Fri, 27 May 2011 08:54:40 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> On Thu, 26 May 2011 12:52:07 -0700
> Andrew Morton <[email protected]> wrote:
>
> > On Fri, 6 May 2011 15:13:02 +0900
> > KAMEZAWA Hiroyuki <[email protected]> wrote:
> >
> > > > It would be much better to work out the optimum time at which to rotate
> > > > the index via some deterministic means.
> > > >
> > > > If we can't think of a way of doing that then we should at least pace
> > > > the rotation frequency via something saner than wall-time. Such as
> > > > number-of-pages-scanned.
> > > >
> > >
> > >
> > > What I think now is using reclaim_stat or using some fairness based on
> > > the ratio of inactive file caches. We can calculate the total sum of
> > > reclaim_stat, which gives us a scan_ratio for the whole memcg. And we can
> > > calculate the LRU rotate/scan ratio per node. If the rotate/scan ratio is small,
> > > the node will be a good candidate as a reclaim target. Hmm,
> > >
> > > - check which memory (anon or file) should be scanned.
> > >   (If file is too small, the rotate/scan ratio of file is meaningless.)
> > > - check the rotate/scan ratio of each node.
> > > - calculate weights for each node (by some logic?)
> > > - give a fair scan w.r.t. each node's weight.
> > >
> > > Hmm, I'll have a study on this.
> >
> > How's the study coming along ;)
> >
> > I'll send this in to Linus today, but I'll feel grumpy while doing so.
> > We really should do something smarter here - the magic constant will
> > basically always be suboptimal for everyone and we end up tweaking its
> > value (if we don't, then the feature just wasn't valuable in the first
> > place) and then we add a tunable and then people try to tweak the
> > default setting of the tunable and then I deride them for not setting
> > the tunable in initscripts and then we have to maintain the stupid
> > tunable after we've changed the internal implementation and it's all
> > basically screwed up.
> >
> > How do we automatically determine the optimum time at which to rotate,
> > at runtime?
> >
>
> Ah, I think I should check it after dirty page accounting comes...because
> the ratio of dirty pages is important information..
>
> Ok, what I think now is just comparing the number of INACTIVE_FILE or the number
> of FILE CACHES per node.
>
> I think we can periodically update the per-node and total amount of file caches
> and record the per-node
> 	node_file_cache * 100 / total_file_cache
> information in memcg's per-node structure.
>

Hmmm..something like this ?
==
This cannot be applied to mmotm directly.
This patch is built from tons of magic numbers....I need more study
before I can write a simpler one.

To begin with, mem_cgroup can reclaim memory from anywhere; it only checks
the amount of memory. Currently, the victim node to be reclaimed is determined
simply by round-robin.

This patch adds a scheduler similar to weighted fair-share scanning
among nodes. We already periodically update mem->scan_nodes to know
which nodes have evictable memory; this patch gathers more information.

This patch calculates the "weight" of each node as

	(nr_inactive_file + nr_active_file/10) * (200 - swappiness)
	+ (nr_inactive_anon) * (swappiness)

(see vmscan.c::get_scan_count() for the meaning of swappiness)
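
For illustration only, with made-up numbers: take swappiness = 60 and a node
holding 1000 inactive-file, 500 active-file and 300 inactive-anon pages. Then

	weight = (1000 + 500/10) * (200 - 60) + 300 * 60
	       = 1050 * 140 + 18000
	       = 165000

before the weights are normalized against NODE_SCAN_LOTTERY below.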

And it selects some nodes in a fair way, proportional to their weight.
The selected nodes are cached in mem->victim_nodes, and victim_nodes
will be visited in round-robin.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 84 insertions(+), 18 deletions(-)

Index: memcg_async/mm/memcontrol.c
===================================================================
--- memcg_async.orig/mm/memcontrol.c
+++ memcg_async/mm/memcontrol.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/random.h>
#include "internal.h"

#include <asm/uaccess.h>
@@ -149,6 +150,7 @@ struct mem_cgroup_per_zone {
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])

struct mem_cgroup_per_node {
+ u64 scan_weight;
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

@@ -257,6 +259,7 @@ struct mem_cgroup {
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
+ nodemask_t victim_nodes;
unsigned long next_scan_node_update;
#endif
/*
@@ -1732,9 +1735,21 @@ u64 mem_cgroup_get_limit(struct mem_cgro
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
+
+/*
+ * This is for selecting a victim node with lottery proportional share
+ * scheduling. This LOTTERY value can be arbitrary but must be higher
+ * than max number of nodes.
+ */
+#define NODE_SCAN_LOTTERY (1 << 15)
+#define NODE_SCAN_LOTTERY_MASK (NODE_SCAN_LOTTERY - 1)
+
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
{
int nid;
+ u64 total_weight;
+ unsigned long swappiness;
+ int nr_selection;

if (!force && time_after(mem->next_scan_node_update, jiffies))
return;
@@ -1742,18 +1757,77 @@ static void mem_cgroup_may_update_nodema
mem->next_scan_node_update = jiffies + 10*HZ;
/* make a nodemask where this memcg uses memory from */
mem->scan_nodes = node_states[N_HIGH_MEMORY];
+ nodes_clear(mem->victim_nodes);
+
+ swappiness = mem_cgroup_swappiness(mem);
+ total_weight = 0;

for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+ u64 val, file_weight, anon_weight, pages;
+ int lru;

- if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
- continue;
+ lru = LRU_INACTIVE_FILE;
+ val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+ file_weight = val;
+ pages = val;

- if (total_swap_pages &&
- (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
- continue;
- node_clear(nid, mem->scan_nodes);
+ lru = LRU_ACTIVE_FILE;
+ val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+ /*
+ * This is a magic calculation. We add 10% of active file
+ * to weight. This should be tweaked..
+ */
+ if (val)
+ file_weight += val/10;
+ pages += val;
+
+ if (total_swap_pages) {
+ lru = LRU_INACTIVE_ANON;
+ val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+ anon_weight = val;
+ pages += val;
+ lru = LRU_ACTIVE_ANON;
+ val = mem_cgroup_get_zonestat_node(mem, nid, lru);
+ /*
+ * Magic again. We don't want to take active_anon into
+ * account but cannot ignore it completely....add +1.
+ */
+ if (val)
+ anon_weight += 1;
+ pages += val;
+ } else
+ anon_weight = 0;
+ mem->info.nodeinfo[nid]->scan_weight =
+ file_weight * (200 - swappiness) +
+ anon_weight * swappiness;
+ if (!pages)
+ node_clear(nid, mem->scan_nodes);
+
+ total_weight += mem->info.nodeinfo[nid]->scan_weight;
+ }
+ /* NORMALIZE weight information.*/
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ mem->info.nodeinfo[nid]->scan_weight =
+ mem->info.nodeinfo[nid]->scan_weight
+ * NODE_SCAN_LOTTERY/ total_weight;
+ }
+ /*
+ * because checking lottery at every scan is heavy. we cache
+ * some results. These victims will be used for the next 10sec.
+ * Even if scan_nodes is empty, the victim_nodes includes node 0
+ * at least.
+ */
+ nr_selection = int_sqrt(nodes_weight(mem->scan_nodes)) + 1;
+
+ while (nr_selection >= 0) {
+ /* draw a ticket comparable to the normalized weights */
+ int lottery = random32() & NODE_SCAN_LOTTERY_MASK;
+ for_each_node_mask(nid, mem->scan_nodes) {
+ lottery -= mem->info.nodeinfo[nid]->scan_weight;
+ if (lottery <= 0)
+ break;
+ }
+ /* scan_nodes may be empty or rounding may eat the ticket: use node 0 */
+ if (nid >= MAX_NUMNODES)
+ nid = 0;
+ node_set(nid, mem->victim_nodes);
+ /* without this decrement the loop never terminates */
+ nr_selection--;
}
}

@@ -1776,17 +1850,9 @@ int mem_cgroup_select_victim_node(struct
mem_cgroup_may_update_nodemask(mem, false);
node = mem->last_scanned_node;

- node = next_node(node, mem->scan_nodes);
+ node = next_node(node, mem->victim_nodes);
if (node == MAX_NUMNODES)
- node = first_node(mem->scan_nodes);
- /*
- * We call this when we hit limit, not when pages are added to LRU.
- * No LRU may hold pages because all pages are UNEVICTABLE or
- * memcg is too small and all pages are not on LRU. In that case,
- * we use curret node.
- */
- if (unlikely(node == MAX_NUMNODES))
- node = numa_node_id();
+ node = first_node(mem->victim_nodes);

mem->last_scanned_node = node;
return node;