Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758480AbZLPFqt (ORCPT ); Wed, 16 Dec 2009 00:46:49 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754196AbZLPFqs (ORCPT ); Wed, 16 Dec 2009 00:46:48 -0500 Received: from mail-yx0-f187.google.com ([209.85.210.187]:59325 "EHLO mail-yx0-f187.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753944AbZLPFqr convert rfc822-to-8bit (ORCPT ); Wed, 16 Dec 2009 00:46:47 -0500 MIME-Version: 1.0 In-Reply-To: <4B283B7F.2050403@cn.fujitsu.com> References: <4B283B7F.2050403@cn.fujitsu.com> Date: Wed, 16 Dec 2009 07:46:46 +0200 Message-ID: Subject: Re: [PATCH RFC v2 1/4] cgroup: implement eventfd-based generic API for notifications From: "Kirill A. Shutemov" To: Li Zefan Cc: containers@lists.linux-foundation.org, linux-mm@kvack.org, Paul Menage , Andrew Morton , KAMEZAWA Hiroyuki , Balbir Singh , Pavel Emelyanov , Dan Malek , Vladislav Buzov , Daisuke Nishimura , linux-kernel@vger.kernel.org Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8BIT Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 13014 Lines: 408 On Wed, Dec 16, 2009 at 3:44 AM, Li Zefan wrote: > Kirill A. Shutemov wrote: >> This patch introduces write-only file "cgroup.event_control" in every >> cgroup. >> >> To register new notification handler you need: >> - create an eventfd; >> - open a control file to be monitored. Callbacks register_event() and >> unregister_event() must be defined for the control file; >> - write "<event_fd> <control_fd> <args>" to cgroup.event_control. >> Interpretation of args is defined by control file implementation; >> >> eventfd will be woken up by control file implementation or when the >> cgroup is removed. >> >> To unregister notification handler just close eventfd. 
>> >> If you need notification functionality for a control file you have to >> implement callbacks register_event() and unregister_event() in the >> struct cftype. >> >> Signed-off-by: Kirill A. Shutemov >> --- >> include/linux/cgroup.h | 20 +++++ >> kernel/cgroup.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++- >> 2 files changed, 234 insertions(+), 1 deletions(-) >> >> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h >> index 0008dee..7ad3078 100644 >> --- a/include/linux/cgroup.h >> +++ b/include/linux/cgroup.h >> @@ -220,6 +220,10 @@ struct cgroup { >> >> /* For RCU-protected deletion */ >> struct rcu_head rcu_head; >> + >> + /* List of events which userspace want to recieve */ >> + struct list_head event_list; >> + struct mutex event_list_mutex; >> }; >> >> /* >> @@ -362,6 +366,22 @@ struct cftype { >> int (*trigger)(struct cgroup *cgrp, unsigned int event); >> >> int (*release)(struct inode *inode, struct file *file); >> + >> + /* >> + * register_event() callback will be used to add new userspace >> + * waiter for changes related to the cftype. Implement it if >> + * you want to provide this functionality. Use eventfd_signal() >> + * on eventfd to send notification to userspace. >> + */ >> + int (*register_event)(struct cgroup *cgrp, struct cftype *cft, >> + struct eventfd_ctx *eventfd, const char *args); >> + /* >> + * unregister_event() callback will be called when userspace >> + * close the eventfd. This callback must be implemented, if you >> + * provide register_event(). 
>> + */ >> + int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, >> + struct eventfd_ctx *eventfd); >> }; >> >> struct cgroup_scanner { >> diff --git a/kernel/cgroup.c b/kernel/cgroup.c >> index 0249f4b..f7ec3ca 100644 >> --- a/kernel/cgroup.c >> +++ b/kernel/cgroup.c >> @@ -4,6 +4,10 @@ >> * Based originally on the cpuset system, extracted by Paul Menage >> * Copyright (C) 2006 Google, Inc >> * >> + * Notifiactions support > > s/Notifiactions/Notifications Thanks. >> + * Copyright (C) 2009 Nokia Corporation >> + * Author: Kirill A. Shutemov >> + * >> * Copyright notices from the original cpuset code: >> * -------------------------------------------------- >> * Copyright (C) 2003 BULL SA. >> @@ -51,6 +55,8 @@ >> #include <linux/pid_namespace.h> >> #include <linux/idr.h> >> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ >> +#include <linux/eventfd.h> >> +#include <linux/poll.h> >> >> #include <asm/atomic.h> >> >> @@ -146,6 +152,36 @@ struct css_id { >> unsigned short stack[0]; /* Array of Length (depth+1) */ >> }; >> >> +/* >> + * cgroup_event represents event which userspace want to recieve. > > s/event/events ? Thanks. > >> + */ >> +struct cgroup_event { >> + /* >> + * Cgroup which the event belongs to. >> + */ >> + struct cgroup *cgrp; >> + /* >> + * Control file which the event associated. >> + */ >> + struct cftype *cft; >> + /* >> + * eventfd to signal userspace about the event. >> + */ >> + struct eventfd_ctx *eventfd; >> + /* >> + * Each of these stored in a list by the cgroup. >> + */ >> + struct list_head list; >> + /* >> + * All fields below needed to unregister event when >> + * userspace closes eventfd. >> + */ >> + poll_table pt; >> + wait_queue_head_t *wqh; >> + wait_queue_t wait; >> + struct work_struct remove; >> +}; > > Please add a blank line here. Ok. 
>> +static void cgroup_event_remove(struct cgroup_event *event); >> >> /* The list of hierarchy roots */ >> >> @@ -734,14 +770,29 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) >> static int cgroup_call_pre_destroy(struct cgroup *cgrp) >> { >> struct cgroup_subsys *ss; >> + struct cgroup_event *event, *tmp; >> int ret = 0; >> >> for_each_subsys(cgrp->root, ss) >> if (ss->pre_destroy) { >> ret = ss->pre_destroy(ss, cgrp); >> if (ret) >> - break; >> + goto out; >> } >> + >> + /* >> + * Unregister events and notify userspace. >> + * FIXME: How to avoid race with cgroup_event_remove_work() >> + * which runs from workqueue? >> + */ >> + mutex_lock(&cgrp->event_list_mutex); >> + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { >> + cgroup_event_remove(event); >> + eventfd_signal(event->eventfd, 1); > > How can you access event after you kfree()ed it in cgroup_event_remove()? Nice catch. Thank you. >> + } >> + mutex_unlock(&cgrp->event_list_mutex); >> + >> +out: >> return ret; >> } >> >> @@ -1136,6 +1187,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) >> INIT_LIST_HEAD(&cgrp->release_list); >> INIT_LIST_HEAD(&cgrp->pidlists); >> mutex_init(&cgrp->pidlist_mutex); >> + INIT_LIST_HEAD(&cgrp->event_list); >> + mutex_init(&cgrp->event_list_mutex); >> } >> >> static void init_cgroup_root(struct cgroupfs_root *root) >> @@ -1935,6 +1988,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { >> .rename = cgroup_rename, >> }; >> >> +/* >> + * Check if a file is a control file >> + */ >> +static inline struct cftype *__file_cft(struct file *file) >> +{ >> + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) >> + return ERR_PTR(-EINVAL); > > I don't think this check is needed. 
> >> + return __d_cft(file->f_dentry); >> +} >> + >> static int cgroup_create_file(struct dentry *dentry, mode_t mode, >> struct super_block *sb) >> { >> @@ -2789,6 +2852,151 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, >> return 0; >> } >> >> +static inline void cgroup_event_remove(struct cgroup_event *event) >> +{ >> + struct cgroup *cgrp = event->cgrp; >> + >> + BUG_ON(event->cft->unregister_event(cgrp, event->cft, event->eventfd)); >> + eventfd_ctx_put(event->eventfd); >> + remove_wait_queue(event->wqh, &event->wait); >> + list_del(&event->list); >> + kfree(event); >> +} >> + >> +static void cgroup_event_remove_work(struct work_struct *work) >> +{ >> + struct cgroup_event *event = container_of(work, struct cgroup_event, >> + remove); >> + struct cgroup *cgrp = event->cgrp; >> + >> + mutex_lock(&cgrp->event_list_mutex); >> + cgroup_event_remove(event); >> + mutex_unlock(&cgrp->event_list_mutex); >> +} >> + >> +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, >> + int sync, void *key) >> +{ >> + struct cgroup_event *event = container_of(wait, >> + struct cgroup_event, wait); >> + unsigned long flags = (unsigned long)key; >> + >> + if (flags & POLLHUP) >> + /* >> + * This function called with spinlock taken, but > > s/called/is called/ ? Ok. >> + * cgroup_event_remove() may sleep, so we have >> + * to run it in a workqueue. >> + */ >> + schedule_work(&event->remove); > > Please use: > > if (...) { > ... > } Ok. >> + >> + return 0; >> +} >> + >> +static void cgroup_event_ptable_queue_proc(struct file *file, >> + wait_queue_head_t *wqh, poll_table *pt) >> +{ >> + struct cgroup_event *event = container_of(pt, >> + struct cgroup_event, pt); >> + >> + event->wqh = wqh; >> + add_wait_queue(wqh, &event->wait); >> +} >> + >> +static int cgroup_write_event_control(struct cgroup *cont, struct cftype *cft, > > Please consistently use "cgrp" Ok. 
>> + const char *buffer) >> +{ >> + struct cgroup_event *event = NULL; >> + unsigned int efd, cfd; >> + struct file *efile = NULL; >> + struct file *cfile = NULL; >> + char *endp; >> + int ret; >> + >> + efd = simple_strtoul(buffer, &endp, 10); >> + if (*endp != ' ') >> + return -EINVAL; >> + buffer = endp + 1; >> + >> + cfd = simple_strtoul(buffer, &endp, 10); >> + if ((*endp != ' ') && (*endp != '\0')) >> + return -EINVAL; >> + buffer = endp + 1; >> + >> + event = kzalloc(sizeof(*event), GFP_KERNEL); >> + if (!event) >> + return -ENOMEM; >> + event->cgrp = cont; >> + INIT_LIST_HEAD(&event->list); >> + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); >> + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); >> + INIT_WORK(&event->remove, cgroup_event_remove_work); >> + >> + efile = eventfd_fget(efd); >> + if (IS_ERR(efile)) { >> + ret = PTR_ERR(efile); >> + goto fail; >> + } >> + >> + event->eventfd = eventfd_ctx_fileget(efile); >> + if (IS_ERR(event->eventfd)) { >> + ret = PTR_ERR(event->eventfd); >> + goto fail; >> + } >> + >> + cfile = fget(cfd); >> + if (!cfile) { >> + ret = -EBADF; >> + goto fail; >> + } >> + >> + /* the process need read permission on control file */ >> + ret = file_permission(cfile, MAY_READ); >> + if (ret < 0) >> + goto fail; >> + >> + event->cft = __file_cft(cfile); >> + if (IS_ERR(event->cft)) { >> + ret = PTR_ERR(event->cft); >> + goto fail; >> + } >> + >> + if (!event->cft->register_event || !event->cft->unregister_event) { >> + ret = -EINVAL; >> + goto fail; >> + } >> + >> + ret = event->cft->register_event(cont, event->cft, >> + event->eventfd, buffer); >> + if (ret) >> + goto fail; >> + >> + efile->f_op->poll(efile, &event->pt); >> + >> + mutex_lock(&cont->event_list_mutex); >> + list_add(&event->list, &cont->event_list); >> + mutex_unlock(&cont->event_list_mutex); >> + >> + fput(cfile); >> + fput(efile); >> + >> + return 0; >> + >> +fail: >> + if (!IS_ERR(cfile)) >> + fput(cfile); >> + >> + if (event && 
event->eventfd && !IS_ERR(event->eventfd)) >> + eventfd_ctx_put(event->eventfd); >> + >> + if (!IS_ERR(efile)) >> + fput(efile); >> + >> + if (event) >> + kfree(event); > > kfree(NULL) is ok Ok. >> + >> + return ret; >> +} >> + >> /* >> * for the common functions, 'private' gives the type of file >> */ >> @@ -2814,6 +3022,11 @@ static struct cftype files[] = { >> .read_u64 = cgroup_read_notify_on_release, >> .write_u64 = cgroup_write_notify_on_release, >> }, >> + { >> + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", >> + .write_string = cgroup_write_event_control, >> + .mode = S_IWUGO, > > We want this file to be writable to everyone ? Yes. We check permission of the file which we want to track. >> + }, >> }; >> >> static struct cftype cft_release_agent = { > Thank you. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/