Date: Tue, 15 Dec 2009 10:58:50 +0900
From: KAMEZAWA Hiroyuki
To: "Kirill A. Shutemov"
Cc: containers@lists.linux-foundation.org, linux-mm@kvack.org, Paul Menage,
    Li Zefan, Andrew Morton, Balbir Singh, Pavel Emelyanov, Dan Malek,
    Vladislav Buzov, Daisuke Nishimura, linux-kernel@vger.kernel.org
Subject: Re: [PATCH RFC v2 4/4] memcg: implement memory thresholds
Message-Id: <20091215105850.87203454.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <9e6e8d687224c6cbc54281f7c3d07983f701f93d.1260571675.git.kirill@shutemov.name>
References: <747ea0ec22b9348208c80f86f7a813728bf8e50a.1260571675.git.kirill@shutemov.name>
 <9e6e8d687224c6cbc54281f7c3d07983f701f93d.1260571675.git.kirill@shutemov.name>

On Sat, 12 Dec 2009 00:59:19 +0200
"Kirill A. Shutemov" wrote:

> It allows to register multiple memory and memsw thresholds and gets
> notifications when it crosses.
>
> To register a threshold application need:
> - create an eventfd;
> - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
> - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
>   cgroup.event_control.
>
> Application will be notified through eventfd when memory usage crosses
> threshold in any direction.
>
> It's applicable for root and non-root cgroup.
>
> It uses stats to track memory usage, simmilar to soft limits. It checks
> if we need to send event to userspace on every 100 page in/out. I guess
> it's good compromise between performance and accuracy of thresholds.
>
> Signed-off-by: Kirill A. Shutemov
> ---
>  mm/memcontrol.c |  263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 263 insertions(+), 0 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c6081cc..5ba2140 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6,6 +6,10 @@
>   * Copyright 2007 OpenVZ SWsoft Inc
>   * Author: Pavel Emelianov
>   *
> + * Memory thresholds
> + * Copyright (C) 2009 Nokia Corporation
> + * Author: Kirill A. Shutemov
> + *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
>   * the Free Software Foundation; either version 2 of the License, or
> @@ -38,6 +42,7 @@
>  #include
>  #include
>  #include
> +#include <linux/eventfd.h>
>  #include "internal.h"
>
>  #include
> @@ -56,6 +61,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>
>  static DEFINE_MUTEX(memcg_tasklist);   /* can be hold under cgroup_mutex */
>  #define SOFTLIMIT_EVENTS_THRESH (1000)
> +#define THRESHOLDS_EVENTS_THRESH (100)
>
>  /*
>   * Statistics for memory cgroup.
> @@ -72,6 +78,8 @@ enum mem_cgroup_stat_index {
>         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
>         MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
>                                         used by soft limit implementation */
> +       MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
> +                                       used by threshold implementation */
>
>         MEM_CGROUP_STAT_NSTATS,
>  };
> @@ -182,6 +190,15 @@ struct mem_cgroup_tree {
>
>  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
>
> +struct mem_cgroup_threshold {
> +       struct list_head list;
> +       struct eventfd_ctx *eventfd;
> +       u64 threshold;
> +};
> +
> +static bool mem_cgroup_threshold_check(struct mem_cgroup* mem);
> +static void mem_cgroup_threshold(struct mem_cgroup* mem, bool swap);
> +
>  /*
>   * The memory controller data structure. The memory controller controls both
>   * page cache and RSS per cgroup. We would eventually like to provide
> @@ -233,6 +250,19 @@ struct mem_cgroup {
>         /* set when res.limit == memsw.limit */
>         bool memsw_is_minimum;
>
> +       /* protect lists of thresholds*/
> +       spinlock_t thresholds_lock;
> +
> +       /* thresholds for memory usage */
> +       struct list_head thresholds;
> +       struct mem_cgroup_threshold *below_threshold;
> +       struct mem_cgroup_threshold *above_threshold;
> +
> +       /* thresholds for mem+swap usage */
> +       struct list_head memsw_thresholds;
> +       struct mem_cgroup_threshold *memsw_below_threshold;
> +       struct mem_cgroup_threshold *memsw_above_threshold;
> +
>         /*
>          * statistics. This must be placed at the end of memcg.
>          */
> @@ -519,6 +549,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
>                 __mem_cgroup_stat_add_safe(cpustat,
>                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
>         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
> +       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
> +
>         put_cpu();
>  }
>
> @@ -1363,6 +1395,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>         if (mem_cgroup_soft_limit_check(mem))
>                 mem_cgroup_update_tree(mem, page);
>  done:
> +       if (mem_cgroup_threshold_check(mem)) {
> +               mem_cgroup_threshold(mem, false);
> +               if (do_swap_account)
> +                       mem_cgroup_threshold(mem, true);
> +       }
>         return 0;
>  nomem:
>         css_put(&mem->css);
> @@ -1906,6 +1943,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>
>         if (mem_cgroup_soft_limit_check(mem))
>                 mem_cgroup_update_tree(mem, page);
> +       if (mem_cgroup_threshold_check(mem)) {
> +               mem_cgroup_threshold(mem, false);
> +               if (do_swap_account)
> +                       mem_cgroup_threshold(mem, true);
> +       }
>         /* at swapout, this memcg will be accessed to record to swap */
>         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
>                 css_put(&mem->css);
> @@ -2860,11 +2902,181 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
>  }
>
>
> +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
> +{
> +       bool ret = false;
> +       int cpu;
> +       s64 val;
> +       struct mem_cgroup_stat_cpu *cpustat;
> +
> +       cpu = get_cpu();
> +       cpustat = &mem->stat.cpustat[cpu];
> +       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
> +       if (unlikely(val < 0)) {
> +               __mem_cgroup_stat_set(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
> +                               THRESHOLDS_EVENTS_THRESH);
> +               ret = true;
> +       }
> +       put_cpu();
> +       return ret;
> +}
> +

Hmm. Please check

        if (likely(list_empty(&mem->thresholds) &&
                   list_empty(&mem->memsw_thresholds)))
                return false;

or add a flag such as mem->no_threshold_check to skip this routine quickly.

_OR_

I personally don't like having two counters to catch events. How about this?
Add

        struct mem_cgroup {
                atomic_t event_counter;         // this is incremented per 32 page-in/out
                atomic_t last_softlimit_check;
                atomic_t last_thresh_check;
        };

        static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
        {
                decrement percpu event counter.
                if (percpu counter reaches 0) {
                        if (atomic_dec_and_test(&mem->last_thresh_check)) {
                                check threshold.
                                reset counter.
                        }
                        if (atomic_dec_and_test(&mem->last_softlimit_check)) {
                                update softlimit tree.
                                reset counter.
                        }
                        reset percpu counter.
                }
        }

Then, you can have a counter that acts like a system-wide event counter.

> +static void mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> +{
> +       struct mem_cgroup_threshold **below, **above;
> +       struct list_head *thresholds;
> +       u64 usage = mem_cgroup_usage(memcg, swap);
> +
> +       if (!swap) {
> +               thresholds = &memcg->thresholds;
> +               above = &memcg->above_threshold;
> +               below = &memcg->below_threshold;
> +       } else {
> +               thresholds = &memcg->memsw_thresholds;
> +               above = &memcg->memsw_above_threshold;
> +               below = &memcg->memsw_below_threshold;
> +       }
> +
> +       spin_lock(&memcg->thresholds_lock);
> +       if ((*above)->threshold <= usage) {
> +               *below = *above;
> +               list_for_each_entry_continue((*above), thresholds, list) {
> +                       eventfd_signal((*below)->eventfd, 1);
> +                       if ((*above)->threshold > usage)
> +                               break;
> +                       *below = *above;
> +               }
> +       } else if ((*below)->threshold > usage) {
> +               *above = *below;
> +               list_for_each_entry_continue_reverse((*below), thresholds,
> +                               list) {
> +                       eventfd_signal((*above)->eventfd, 1);
> +                       if ((*below)->threshold <= usage)
> +                               break;
> +                       *above = *below;
> +               }
> +       }
> +       spin_unlock(&memcg->thresholds_lock);
> +}

Could you add a comment on the above check? And do we need a *spin_lock* here?
Can't you use an RCU list walk?

If you have to use a spinlock here, it is effectively a system-wide spinlock,
and a threshold of "100" is too small, I think. Even if you can't use RCU,
please use a mutex (with checking gfp_mask).
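For example, a rough sketch of the RCU direction, not a drop-in replacement for
your patch: prev_usage/memsw_prev_usage would be new atomic64_t fields in
struct mem_cgroup, register/unregister would use list_add_rcu()/list_del_rcu()
under a per-memcg mutex, and an entry would only be kfree()d after
synchronize_rcu(). Needs <linux/rculist.h>.

        /*
         * Lockless check on the charge/uncharge path: walk the sorted
         * threshold list under rcu_read_lock() and signal every threshold
         * that lies between the previously recorded usage and the current
         * usage, in either direction.
         */
        static void mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
        {
                struct mem_cgroup_threshold *t;
                struct list_head *thresholds;
                u64 usage = mem_cgroup_usage(memcg, swap);
                u64 prev;

                if (!swap) {
                        thresholds = &memcg->thresholds;
                        prev = atomic64_xchg(&memcg->prev_usage, usage);
                } else {
                        thresholds = &memcg->memsw_thresholds;
                        prev = atomic64_xchg(&memcg->memsw_prev_usage, usage);
                }

                rcu_read_lock();
                list_for_each_entry_rcu(t, thresholds, list) {
                        bool was_below = prev < t->threshold;
                        bool is_below = usage < t->threshold;

                        /* signal only thresholds crossed since the last check */
                        if (was_below != is_below)
                                eventfd_signal(t->eventfd, 1);
                }
                rcu_read_unlock();
        }

Then the hot path takes no lock at all, register/unregister serialize on the
mutex, and THRESHOLDS_EVENTS_THRESH == 100 becomes purely an accuracy knob
rather than a lock-contention problem.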
Thanks,
-Kame

> +
> +static void mem_cgroup_invalidate_thresholds(struct mem_cgroup *memcg,
> +               bool swap)
> +{
> +       struct list_head *thresholds;
> +       struct mem_cgroup_threshold **below, **above;
> +       u64 usage = mem_cgroup_usage(memcg, swap);
> +
> +       if (!swap) {
> +               thresholds = &memcg->thresholds;
> +               above = &memcg->above_threshold;
> +               below = &memcg->below_threshold;
> +       } else {
> +               thresholds = &memcg->memsw_thresholds;
> +               above = &memcg->memsw_above_threshold;
> +               below = &memcg->memsw_below_threshold;
> +       }
> +
> +       *below = NULL;
> +       list_for_each_entry((*above), thresholds, list) {
> +               if ((*above)->threshold > usage) {
> +                       BUG_ON(!*below);
> +                       break;
> +               }
> +               *below = *above;
> +       }
> +}
> +
> +static inline struct mem_cgroup_threshold *mem_cgroup_threshold_alloc(void)
> +{
> +       struct mem_cgroup_threshold *new;
> +
> +       new = kmalloc(sizeof(*new), GFP_KERNEL);
> +       if (!new)
> +               return NULL;
> +       INIT_LIST_HEAD(&new->list);
> +
> +       return new;
> +}
> +
> +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
> +               struct eventfd_ctx *eventfd, const char *args)
> +{
> +       u64 threshold;
> +       struct mem_cgroup_threshold *new, *tmp;
> +       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +       struct list_head *thresholds;
> +       int type = MEMFILE_TYPE(cft->private);
> +       int ret;
> +
> +       ret = res_counter_memparse_write_strategy(args, &threshold);
> +       if (ret)
> +               return ret;
> +
> +       new = mem_cgroup_threshold_alloc();
> +       if (!new)
> +               return -ENOMEM;
> +       new->eventfd = eventfd;
> +       new->threshold = threshold;
> +
> +       if (type == _MEM)
> +               thresholds = &memcg->thresholds;
> +       else if (type == _MEMSWAP)
> +               thresholds = &memcg->memsw_thresholds;
> +       else
> +               BUG();
> +
> +       /* Check if a threshold crossed before adding a new one */
> +       mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +       spin_lock(&memcg->thresholds_lock);
> +       list_for_each_entry(tmp, thresholds, list)
> +               if (new->threshold < tmp->threshold) {
> +                       list_add_tail(&new->list, &tmp->list);
> +                       break;
> +               }
> +       mem_cgroup_invalidate_thresholds(memcg, type == _MEMSWAP);
> +       spin_unlock(&memcg->thresholds_lock);
> +
> +       return 0;
> +}
> +
> +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
> +               struct eventfd_ctx *eventfd)
> +{
> +       struct mem_cgroup_threshold *threshold, *tmp;
> +       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +       struct list_head *thresholds;
> +       int type = MEMFILE_TYPE(cft->private);
> +
> +       if (type == _MEM)
> +               thresholds = &memcg->thresholds;
> +       else if (type == _MEMSWAP)
> +               thresholds = &memcg->memsw_thresholds;
> +       else
> +               BUG();
> +
> +       /* Check if a threshold crossed before adding a new one */
> +       mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +       spin_lock(&memcg->thresholds_lock);
> +       list_for_each_entry_safe(threshold, tmp, thresholds, list)
> +               if (threshold->eventfd == eventfd) {
> +                       list_del(&threshold->list);
> +                       kfree(threshold);
> +               }
> +       mem_cgroup_invalidate_thresholds(memcg, type == _MEMSWAP);
> +       spin_unlock(&memcg->thresholds_lock);
> +
> +       return 0;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>         {
>                 .name = "usage_in_bytes",
>                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
>                 .read_u64 = mem_cgroup_read,
> +               .register_event = mem_cgroup_register_event,
> +               .unregister_event = mem_cgroup_unregister_event,
>         },
>         {
>                 .name = "max_usage_in_bytes",
> @@ -2916,6 +3128,8 @@ static struct cftype memsw_cgroup_files[] = {
>                 .name = "memsw.usage_in_bytes",
>                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
>                 .read_u64 = mem_cgroup_read,
> +               .register_event = mem_cgroup_register_event,
> +               .unregister_event = mem_cgroup_unregister_event,
>         },
>         {
>                 .name = "memsw.max_usage_in_bytes",
> @@ -2990,6 +3204,48 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>         kfree(mem->info.nodeinfo[node]);
>  }
>
> +static int mem_cgroup_thresholds_init(struct mem_cgroup *mem)
> +{
> +       INIT_LIST_HEAD(&mem->thresholds);
> +       INIT_LIST_HEAD(&mem->memsw_thresholds);
> +
> +       mem->below_threshold = mem_cgroup_threshold_alloc();
> +       list_add(&mem->below_threshold->list, &mem->thresholds);
> +       mem->below_threshold->threshold = 0ULL;
> +
> +       mem->above_threshold = mem_cgroup_threshold_alloc();
> +       list_add_tail(&mem->above_threshold->list, &mem->thresholds);
> +       mem->above_threshold->threshold = RESOURCE_MAX;
> +
> +       mem->memsw_below_threshold = mem_cgroup_threshold_alloc();
> +       list_add(&mem->memsw_below_threshold->list, &mem->memsw_thresholds);
> +       mem->memsw_below_threshold->threshold = 0ULL;
> +
> +       mem->memsw_above_threshold = mem_cgroup_threshold_alloc();
> +       list_add_tail(&mem->memsw_above_threshold->list, &mem->memsw_thresholds);
> +       mem->memsw_above_threshold->threshold = RESOURCE_MAX;
> +
> +       return 0;
> +}
> +
> +static void mem_cgroup_thresholds_free(struct mem_cgroup *mem)
> +{
> +       /* Make sure that lists have only two elements */
> +       BUG_ON((mem->below_threshold) &&
> +                       (mem->above_threshold) &&
> +                       (mem->below_threshold->list.next !=
> +                        &mem->above_threshold->list));
> +       BUG_ON((mem->memsw_below_threshold) &&
> +                       (mem->memsw_above_threshold) &&
> +                       (mem->below_threshold->list.next !=
> +                        &mem->above_threshold->list));
> +
> +       kfree(mem->below_threshold);
> +       kfree(mem->above_threshold);
> +       kfree(mem->memsw_below_threshold);
> +       kfree(mem->memsw_above_threshold);
> +}
> +
>  static int mem_cgroup_size(void)
>  {
>         int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
> @@ -3026,6 +3282,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>  {
>         int node;
>
> +       mem_cgroup_thresholds_free(mem);
> +
>         mem_cgroup_remove_from_trees(mem);
>         free_css_id(&mem_cgroup_subsys, &mem->css);
>
> @@ -3145,6 +3403,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>         mem->last_scanned_child = 0;
>         spin_lock_init(&mem->reclaim_param_lock);
>
> +       spin_lock_init(&mem->thresholds_lock);
> +       error = mem_cgroup_thresholds_init(mem);
> +       if (error)
> +               goto free_out;
> +
>         if (parent)
>                 mem->swappiness = get_swappiness(parent);
>         atomic_set(&mem->refcnt, 1);
> --
> 1.6.5.3
>