Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758476AbZLPBov (ORCPT ); Tue, 15 Dec 2009 20:44:51 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1755372AbZLPBou (ORCPT ); Tue, 15 Dec 2009 20:44:50 -0500 Received: from cn.fujitsu.com ([222.73.24.84]:54176 "EHLO song.cn.fujitsu.com" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1754870AbZLPBot (ORCPT ); Tue, 15 Dec 2009 20:44:49 -0500 Message-ID: <4B283B7F.2050403@cn.fujitsu.com> Date: Wed, 16 Dec 2009 09:44:31 +0800 From: Li Zefan User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1b3pre) Gecko/20090513 Fedora/3.0-2.3.beta2.fc11 Thunderbird/3.0b2 MIME-Version: 1.0 To: "Kirill A. Shutemov" CC: containers@lists.linux-foundation.org, linux-mm@kvack.org, Paul Menage , Andrew Morton , KAMEZAWA Hiroyuki , Balbir Singh , Pavel Emelyanov , Dan Malek , Vladislav Buzov , Daisuke Nishimura , linux-kernel@vger.kernel.org Subject: Re: [PATCH RFC v2 1/4] cgroup: implement eventfd-based generic API for notifications References: In-Reply-To: Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10459 Lines: 385 Kirill A. Shutemov wrote: > This patch introduces write-only file "cgroup.event_control" in every > cgroup. > > To register new notification handler you need: > - create an eventfd; > - open a control file to be monitored. Callbacks register_event() and > unregister_event() must be defined for the control file; > - write "<event_fd> <control_fd> <args>" to cgroup.event_control. > Interpretation of args is defined by control file implementation; > > eventfd will be woken up by control file implementation or when the > cgroup is removed. > > To unregister notification handler just close eventfd. 
> > If you need notification functionality for a control file you have to > implement callbacks register_event() and unregister_event() in the > struct cftype. > > Signed-off-by: Kirill A. Shutemov > --- > include/linux/cgroup.h | 20 +++++ > kernel/cgroup.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 234 insertions(+), 1 deletions(-) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index 0008dee..7ad3078 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -220,6 +220,10 @@ struct cgroup { > > /* For RCU-protected deletion */ > struct rcu_head rcu_head; > + > + /* List of events which userspace want to recieve */ > + struct list_head event_list; > + struct mutex event_list_mutex; > }; > > /* > @@ -362,6 +366,22 @@ struct cftype { > int (*trigger)(struct cgroup *cgrp, unsigned int event); > > int (*release)(struct inode *inode, struct file *file); > + > + /* > + * register_event() callback will be used to add new userspace > + * waiter for changes related to the cftype. Implement it if > + * you want to provide this functionality. Use eventfd_signal() > + * on eventfd to send notification to userspace. > + */ > + int (*register_event)(struct cgroup *cgrp, struct cftype *cft, > + struct eventfd_ctx *eventfd, const char *args); > + /* > + * unregister_event() callback will be called when userspace > + * close the eventfd. This callback must be implemented, if you > + * provide register_event(). 
> + */ > + int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, > + struct eventfd_ctx *eventfd); > }; > > struct cgroup_scanner { > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index 0249f4b..f7ec3ca 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -4,6 +4,10 @@ > * Based originally on the cpuset system, extracted by Paul Menage > * Copyright (C) 2006 Google, Inc > * > + * Notifiactions support s/Notifiactions/Notifications > + * Copyright (C) 2009 Nokia Corporation > + * Author: Kirill A. Shutemov > + * > * Copyright notices from the original cpuset code: > * -------------------------------------------------- > * Copyright (C) 2003 BULL SA. > @@ -51,6 +55,8 @@ > #include > #include > #include /* TODO: replace with more sophisticated array */ > +#include > +#include > > #include > > @@ -146,6 +152,36 @@ struct css_id { > unsigned short stack[0]; /* Array of Length (depth+1) */ > }; > > +/* > + * cgroup_event represents event which userspace want to recieve. s/event/events ? > + */ > +struct cgroup_event { > + /* > + * Cgroup which the event belongs to. > + */ > + struct cgroup *cgrp; > + /* > + * Control file which the event associated. > + */ > + struct cftype *cft; > + /* > + * eventfd to signal userspace about the event. > + */ > + struct eventfd_ctx *eventfd; > + /* > + * Each of these stored in a list by the cgroup. > + */ > + struct list_head list; > + /* > + * All fields below needed to unregister event when > + * userspace closes eventfd. > + */ > + poll_table pt; > + wait_queue_head_t *wqh; > + wait_queue_t wait; > + struct work_struct remove; > +}; Please add a blank line here. 
> +static void cgroup_event_remove(struct cgroup_event *event); > > /* The list of hierarchy roots */ > > @@ -734,14 +770,29 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) > static int cgroup_call_pre_destroy(struct cgroup *cgrp) > { > struct cgroup_subsys *ss; > + struct cgroup_event *event, *tmp; > int ret = 0; > > for_each_subsys(cgrp->root, ss) > if (ss->pre_destroy) { > ret = ss->pre_destroy(ss, cgrp); > if (ret) > - break; > + goto out; > } > + > + /* > + * Unregister events and notify userspace. > + * FIXME: How to avoid race with cgroup_event_remove_work() > + * which runs from workqueue? > + */ > + mutex_lock(&cgrp->event_list_mutex); > + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { > + cgroup_event_remove(event); > + eventfd_signal(event->eventfd, 1); How can you access event after you kfree()ed it in cgroup_event_remove()? > + } > + mutex_unlock(&cgrp->event_list_mutex); > + > +out: > return ret; > } > > @@ -1136,6 +1187,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) > INIT_LIST_HEAD(&cgrp->release_list); > INIT_LIST_HEAD(&cgrp->pidlists); > mutex_init(&cgrp->pidlist_mutex); > + INIT_LIST_HEAD(&cgrp->event_list); > + mutex_init(&cgrp->event_list_mutex); > } > > static void init_cgroup_root(struct cgroupfs_root *root) > @@ -1935,6 +1988,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { > .rename = cgroup_rename, > }; > > +/* > + * Check if a file is a control file > + */ > +static inline struct cftype *__file_cft(struct file *file) > +{ > + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) > + return ERR_PTR(-EINVAL); I don't think this check is needed. 
> + return __d_cft(file->f_dentry); > +} > + > static int cgroup_create_file(struct dentry *dentry, mode_t mode, > struct super_block *sb) > { > @@ -2789,6 +2852,151 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, > return 0; > } > > +static inline void cgroup_event_remove(struct cgroup_event *event) > +{ > + struct cgroup *cgrp = event->cgrp; > + > + BUG_ON(event->cft->unregister_event(cgrp, event->cft, event->eventfd)); > + eventfd_ctx_put(event->eventfd); > + remove_wait_queue(event->wqh, &event->wait); > + list_del(&event->list); > + kfree(event); > +} > + > +static void cgroup_event_remove_work(struct work_struct *work) > +{ > + struct cgroup_event *event = container_of(work, struct cgroup_event, > + remove); > + struct cgroup *cgrp = event->cgrp; > + > + mutex_lock(&cgrp->event_list_mutex); > + cgroup_event_remove(event); > + mutex_unlock(&cgrp->event_list_mutex); > +} > + > +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, > + int sync, void *key) > +{ > + struct cgroup_event *event = container_of(wait, > + struct cgroup_event, wait); > + unsigned long flags = (unsigned long)key; > + > + if (flags & POLLHUP) > + /* > + * This function called with spinlock taken, but s/called/is called/ ? > + * cgroup_event_remove() may sleep, so we have > + * to run it in a workqueue. > + */ > + schedule_work(&event->remove); Please use: if (...) { ... 
} > + > + return 0; > +} > + > +static void cgroup_event_ptable_queue_proc(struct file *file, > + wait_queue_head_t *wqh, poll_table *pt) > +{ > + struct cgroup_event *event = container_of(pt, > + struct cgroup_event, pt); > + > + event->wqh = wqh; > + add_wait_queue(wqh, &event->wait); > +} > + > +static int cgroup_write_event_control(struct cgroup *cont, struct cftype *cft, Please consistently use "cgrp" > + const char *buffer) > +{ > + struct cgroup_event *event = NULL; > + unsigned int efd, cfd; > + struct file *efile = NULL; > + struct file *cfile = NULL; > + char *endp; > + int ret; > + > + efd = simple_strtoul(buffer, &endp, 10); > + if (*endp != ' ') > + return -EINVAL; > + buffer = endp + 1; > + > + cfd = simple_strtoul(buffer, &endp, 10); > + if ((*endp != ' ') && (*endp != '\0')) > + return -EINVAL; > + buffer = endp + 1; > + > + event = kzalloc(sizeof(*event), GFP_KERNEL); > + if (!event) > + return -ENOMEM; > + event->cgrp = cont; > + INIT_LIST_HEAD(&event->list); > + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); > + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); > + INIT_WORK(&event->remove, cgroup_event_remove_work); > + > + efile = eventfd_fget(efd); > + if (IS_ERR(efile)) { > + ret = PTR_ERR(efile); > + goto fail; > + } > + > + event->eventfd = eventfd_ctx_fileget(efile); > + if (IS_ERR(event->eventfd)) { > + ret = PTR_ERR(event->eventfd); > + goto fail; > + } > + > + cfile = fget(cfd); > + if (!cfile) { > + ret = -EBADF; > + goto fail; > + } > + > + /* the process need read permission on control file */ > + ret = file_permission(cfile, MAY_READ); > + if (ret < 0) > + goto fail; > + > + event->cft = __file_cft(cfile); > + if (IS_ERR(event->cft)) { > + ret = PTR_ERR(event->cft); > + goto fail; > + } > + > + if (!event->cft->register_event || !event->cft->unregister_event) { > + ret = -EINVAL; > + goto fail; > + } > + > + ret = event->cft->register_event(cont, event->cft, > + event->eventfd, buffer); > + if (ret) > + 
goto fail; > + > + efile->f_op->poll(efile, &event->pt); > + > + mutex_lock(&cont->event_list_mutex); > + list_add(&event->list, &cont->event_list); > + mutex_unlock(&cont->event_list_mutex); > + > + fput(cfile); > + fput(efile); > + > + return 0; > + > +fail: > + if (!IS_ERR(cfile)) > + fput(cfile); > + > + if (event && event->eventfd && !IS_ERR(event->eventfd)) > + eventfd_ctx_put(event->eventfd); > + > + if (!IS_ERR(efile)) > + fput(efile); > + > + if (event) > + kfree(event); kfree(NULL) is OK, so the "if (event)" check is unnecessary. > + > + return ret; > +} > + > /* > * for the common functions, 'private' gives the type of file > */ > @@ -2814,6 +3022,11 @@ static struct cftype files[] = { > .read_u64 = cgroup_read_notify_on_release, > .write_u64 = cgroup_write_notify_on_release, > }, > + { > + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", > + .write_string = cgroup_write_event_control, > + .mode = S_IWUGO, Do we want this file to be writable by everyone? > + }, > }; > > static struct cftype cft_release_agent = { -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/