From: Greg Thelen <[email protected]>
Added memcg dirty page accounting support routines. These routines are
used by later changes to provide memcg aware writeback and dirty page
limiting. A mem_cgroup_dirty_info() tracepoint is also included to
allow for easier understanding of memcg writeback operation.
Signed-off-by: Greg Thelen <[email protected]>
Signed-off-by: Fengguang Wu <[email protected]>
---
Changelog since v8:
- Use 'memcg' rather than 'mem' for local variables and parameters.
This is consistent with other memory controller code.
include/linux/memcontrol.h | 5 +
mm/memcontrol.c | 112 +++++++++++++++++++++++++++++++++++
2 files changed, 117 insertions(+)
--- linux.orig/include/linux/memcontrol.h 2012-02-25 20:48:34.337580646 +0800
+++ linux/include/linux/memcontrol.h 2012-02-25 20:48:34.361580646 +0800
@@ -36,8 +36,13 @@ enum mem_cgroup_page_stat_item {
MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
+ MEMCG_NR_DIRTYABLE_PAGES, /* # of pages that could be dirty */
};
+unsigned long mem_cgroup_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_page_stat_item item);
+unsigned long mem_cgroup_dirty_pages(struct mem_cgroup *memcg);
+
struct mem_cgroup_reclaim_cookie {
struct zone *zone;
int priority;
--- linux.orig/mm/memcontrol.c 2012-02-25 20:48:34.337580646 +0800
+++ linux/mm/memcontrol.c 2012-02-25 21:09:54.073554384 +0800
@@ -1255,6 +1255,118 @@ int mem_cgroup_swappiness(struct mem_cgr
return memcg->swappiness;
}
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+ if (nr_swap_pages == 0)
+ return false;
+ if (!do_swap_account)
+ return true;
+ if (memcg->memsw_is_minimum)
+ return false;
+ if (res_counter_margin(&memcg->memsw) == 0)
+ return false;
+ return true;
+}
+
+static s64 mem_cgroup_local_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_page_stat_item item)
+{
+ s64 ret;
+
+ switch (item) {
+ case MEMCG_NR_FILE_DIRTY:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY);
+ break;
+ case MEMCG_NR_FILE_WRITEBACK:
+ ret = mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_FILE_WRITEBACK);
+ break;
+ case MEMCG_NR_FILE_UNSTABLE_NFS:
+ ret = mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_FILE_UNSTABLE_NFS);
+ break;
+ case MEMCG_NR_DIRTYABLE_PAGES:
+ ret = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)) +
+ mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+ if (mem_cgroup_can_swap(memcg))
+ ret += mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)) +
+ mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+ break;
+ default:
+ BUG();
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Return the number of additional pages that the @memcg cgroup could allocate.
+ * If use_hierarchy is set, then this involves checking parent mem cgroups to
+ * find the cgroup with the smallest free space.
+ */
+static unsigned long
+mem_cgroup_hierarchical_free_pages(struct mem_cgroup *memcg)
+{
+ u64 free;
+ unsigned long min_free;
+
+ min_free = global_page_state(NR_FREE_PAGES);
+
+ while (memcg) {
+ free = mem_cgroup_margin(memcg);
+ min_free = min_t(u64, min_free, free);
+ memcg = parent_mem_cgroup(memcg);
+ }
+
+ return min_free;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @memcg: memory cgroup to query
+ * @item: memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value.
+ */
+unsigned long mem_cgroup_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_page_stat_item item)
+{
+ struct mem_cgroup *iter;
+ s64 value;
+
+ /*
+ * If we're looking for dirtyable pages we need to evaluate free pages
+ * depending on the limit and usage of the parents first of all.
+ */
+ if (item == MEMCG_NR_DIRTYABLE_PAGES)
+ value = mem_cgroup_hierarchical_free_pages(memcg);
+ else
+ value = 0;
+
+ /*
+	 * Recursively evaluate page statistics against all cgroups under the
+	 * hierarchy tree.
+ */
+ for_each_mem_cgroup_tree(iter, memcg)
+ value += mem_cgroup_local_page_stat(iter, item);
+
+ /*
+ * Summing of unlocked per-cpu counters is racy and may yield a slightly
+ * negative value. Zero is the only sensible value in such cases.
+ */
+ if (unlikely(value < 0))
+ value = 0;
+
+ return value;
+}
+
+unsigned long mem_cgroup_dirty_pages(struct mem_cgroup *memcg)
+{
+ return mem_cgroup_page_stat(memcg, MEMCG_NR_FILE_DIRTY) +
+ mem_cgroup_page_stat(memcg, MEMCG_NR_FILE_WRITEBACK) +
+ mem_cgroup_page_stat(memcg, MEMCG_NR_FILE_UNSTABLE_NFS);
+}
+
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
int cpu;
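For reference, a later dirty-limiting change could consume these helpers
roughly as sketched below; the helper name and the dirty_ratio argument are
hypothetical and not part of this series:

static bool mem_cgroup_dirty_limit_exceeded(struct mem_cgroup *memcg,
					    unsigned int dirty_ratio)
{
	/*
	 * Hypothetical caller: compare the memcg's dirty + writeback +
	 * unstable-NFS pages against a fraction of its dirtyable pages,
	 * analogous to the global dirty_ratio check.
	 */
	unsigned long nr_dirty = mem_cgroup_dirty_pages(memcg);
	unsigned long nr_dirtyable = mem_cgroup_page_stat(memcg,
						MEMCG_NR_DIRTYABLE_PAGES);

	return nr_dirty > nr_dirtyable * dirty_ratio / 100;
}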
On Tue, Feb 28, 2012 at 10:00:26PM +0800, Fengguang Wu wrote:
> From: Greg Thelen <[email protected]>
>
> Added memcg dirty page accounting support routines. These routines are
> used by later changes to provide memcg aware writeback and dirty page
> limiting. A mem_cgroup_dirty_info() tracepoint is also included to
> allow for easier understanding of memcg writeback operation.
Greg, sorry that the mem_cgroup_dirty_info() interface and tracepoints
have been dropped here, since they are not used by this series.
Obviously this patch series is not enough to keep the number of dirty
pages under control; it only tries to improve page reclaim behavior
given whatever number of dirty pages there happens to be. We'll need
further schemes to keep dirty pages at sane levels, so that unrelated
tasks do not suffer from reclaim waits when there are heavy writers in
the same memcg.
Thanks,
Fengguang
On Tue, 28 Feb 2012 22:00:26 +0800
Fengguang Wu <[email protected]> wrote:
> From: Greg Thelen <[email protected]>
>
> Added memcg dirty page accounting support routines. These routines are
> used by later changes to provide memcg aware writeback and dirty page
> limiting. A mem_cgroup_dirty_info() tracepoint is also included to
> allow for easier understanding of memcg writeback operation.
>
> ...
>
> +/*
> + * Return the number of additional pages that the @memcg cgroup could allocate.
> + * If use_hierarchy is set, then this involves checking parent mem cgroups to
> + * find the cgroup with the smallest free space.
> + */
Comment needs revisiting - use_hierarchy does not exist.
> +static unsigned long
> +mem_cgroup_hierarchical_free_pages(struct mem_cgroup *memcg)
> +{
> + u64 free;
> + unsigned long min_free;
> +
> + min_free = global_page_state(NR_FREE_PAGES);
> +
> + while (memcg) {
> + free = mem_cgroup_margin(memcg);
> + min_free = min_t(u64, min_free, free);
> + memcg = parent_mem_cgroup(memcg);
> + }
> +
> + return min_free;
> +}
> +
> +/*
> + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> + * @memcg: memory cgroup to query
> + * @item: memory statistic item exported to the kernel
> + *
> + * Return the accounted statistic value.
> + */
> +unsigned long mem_cgroup_page_stat(struct mem_cgroup *memcg,
> + enum mem_cgroup_page_stat_item item)
> +{
> + struct mem_cgroup *iter;
> + s64 value;
> +
> + /*
> + * If we're looking for dirtyable pages we need to evaluate free pages
> + * depending on the limit and usage of the parents first of all.
> + */
> + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> + value = mem_cgroup_hierarchical_free_pages(memcg);
> + else
> + value = 0;
> +
> + /*
> + * Recursively evaluate page statistics against all cgroups under the
> + * hierarchy tree.
> + */
> + for_each_mem_cgroup_tree(iter, memcg)
> + value += mem_cgroup_local_page_stat(iter, item);
What's the locking rule for for_each_mem_cgroup_tree()? It's unobvious
from the code and isn't documented?
> + /*
> + * Summing of unlocked per-cpu counters is racy and may yield a slightly
> + * negative value. Zero is the only sensible value in such cases.
> + */
> + if (unlikely(value < 0))
> + value = 0;
> +
> + return value;
> +}
> +
>
> ...
>
On Tue, 28 Feb 2012 14:45:07 -0800
Andrew Morton <[email protected]> wrote:
> On Tue, 28 Feb 2012 22:00:26 +0800
> Fengguang Wu <[email protected]> wrote:
>
> > From: Greg Thelen <[email protected]>
> >
> > Added memcg dirty page accounting support routines. These routines are
> > used by later changes to provide memcg aware writeback and dirty page
> > limiting. A mem_cgroup_dirty_info() tracepoint is also included to
> > allow for easier understanding of memcg writeback operation.
> >
> > ...
> >
> > +/*
> > + * Return the number of additional pages that the @memcg cgroup could allocate.
> > + * If use_hierarchy is set, then this involves checking parent mem cgroups to
> > + * find the cgroup with the smallest free space.
> > + */
>
> Comment needs revisiting - use_hierarchy does not exist.
>
> > +static unsigned long
> > +mem_cgroup_hierarchical_free_pages(struct mem_cgroup *memcg)
> > +{
> > + u64 free;
> > + unsigned long min_free;
> > +
> > + min_free = global_page_state(NR_FREE_PAGES);
> > +
> > + while (memcg) {
> > + free = mem_cgroup_margin(memcg);
> > + min_free = min_t(u64, min_free, free);
> > + memcg = parent_mem_cgroup(memcg);
> > + }
> > +
> > + return min_free;
> > +}
> > +
> > +/*
> > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> > + * @memcg: memory cgroup to query
> > + * @item: memory statistic item exported to the kernel
> > + *
> > + * Return the accounted statistic value.
> > + */
> > +unsigned long mem_cgroup_page_stat(struct mem_cgroup *memcg,
> > + enum mem_cgroup_page_stat_item item)
> > +{
> > + struct mem_cgroup *iter;
> > + s64 value;
> > +
> > + /*
> > + * If we're looking for dirtyable pages we need to evaluate free pages
> > + * depending on the limit and usage of the parents first of all.
> > + */
> > + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> > + value = mem_cgroup_hierarchical_free_pages(memcg);
> > + else
> > + value = 0;
> > +
> > + /*
> > + * Recursively evaluate page statistics against all cgroups under the
> > + * hierarchy tree.
> > + */
> > + for_each_mem_cgroup_tree(iter, memcg)
> > + value += mem_cgroup_local_page_stat(iter, item);
>
> What's the locking rule for for_each_mem_cgroup_tree()? It's unobvious
> from the code and isn't documented?
>
Because for_each_mem_cgroup_tree() uses rcu_read_lock() and reference counting
internally, callers are not required to take any lock.
The one rule is that the caller should call mem_cgroup_iter_break() if it wants
to break out of the loop.
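For example, an early exit would look roughly like the sketch below (the
helper name and the threshold-based exit condition are only illustrative):

static s64 mem_cgroup_sum_page_stat_upto(struct mem_cgroup *memcg,
					 enum mem_cgroup_page_stat_item item,
					 s64 threshold)
{
	struct mem_cgroup *iter;
	s64 value = 0;

	/*
	 * No locking is needed around the loop itself; the iterator takes
	 * rcu_read_lock() and css references internally.
	 */
	for_each_mem_cgroup_tree(iter, memcg) {
		value += mem_cgroup_local_page_stat(iter, item);
		if (value >= threshold) {
			/*
			 * Early exit: mem_cgroup_iter_break() drops the
			 * reference still held on the current position.
			 */
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return value;
}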
Thanks,
-Kame