LinuxLists.cc - [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

2005-12-20 23:14:48

Subject: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

I suggested an early of this patch some time ago to see if it was an
acceptable approach and got zero feedback, which presumably means it
is perfect:-)

I've now reviewed it, fixed up the bits I didn't like, and tested it.
It works and I am happy with in.

So: I would like to submit it for inclusion in a future kernel.

Comments, or acks, please :-)

NeilBrown

---------
This allows an attribute file in sysfs to be polled for activity.

It works like this:
Open the file
Read all the contents.
Call poll requesting POLLERR or POLLPRI (so select/exceptfds works)
When poll returns, close the file, and go to top of loop.

Events are signaled by an object manager calling
sysfs_notify(kobj, dir, attr);

If the dir is non-NULL, it is used to find a subdirectory which
contains the attribute (presumably created by sysfs_create_group).

This has a cost of one int per attribute, one wait_queuehead per kobject,
one int per open file.

The name "sysfs_notify" may be confused with the inotify
functionality. Maybe it would be nice to support inotify for sysfs
attributes as well?

This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action
to be pollable

Signed-off-by: Neil Brown <[email protected]>

### Diffstat output
./drivers/md/md.c | 1
./fs/sysfs/dir.c | 1
./fs/sysfs/file.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
./fs/sysfs/inode.c | 20 +++++++++++++++++++
./fs/sysfs/sysfs.h | 1
./include/linux/kobject.h | 2 +
./include/linux/sysfs.h | 7 ++++++
./lib/kobject.c | 1
8 files changed, 80 insertions(+)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~ 2005-12-21 09:42:07.000000000 +1100
+++ ./drivers/md/md.c 2005-12-21 09:43:52.000000000 +1100
@@ -162,6 +162,7 @@ void md_new_event(mddev_t *mddev)
{
atomic_inc(&md_event_count);
wake_up(&md_event_waiters);
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
}

/*

diff ./fs/sysfs/dir.c~current~ ./fs/sysfs/dir.c
--- ./fs/sysfs/dir.c~current~ 2005-12-21 09:43:51.000000000 +1100
+++ ./fs/sysfs/dir.c 2005-12-21 09:43:52.000000000 +1100
@@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_d

memset(sd, 0, sizeof(*sd));
atomic_set(&sd->s_count, 1);
+ atomic_set(&sd->s_event, 0);
INIT_LIST_HEAD(&sd->s_children);
list_add(&sd->s_sibling, &parent_sd->s_children);
sd->s_element = element;

diff ./fs/sysfs/file.c~current~ ./fs/sysfs/file.c
--- ./fs/sysfs/file.c~current~ 2005-12-21 09:43:51.000000000 +1100
+++ ./fs/sysfs/file.c 2005-12-21 09:44:28.000000000 +1100
@@ -7,6 +7,7 @@
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/limits.h>
+#include <linux/poll.h>

#include <asm/uaccess.h>
#include <asm/semaphore.h>
@@ -59,6 +60,7 @@ struct sysfs_buffer {
struct sysfs_ops * ops;
struct semaphore sem;
int needs_read_fill;
+ int event;
};

@@ -74,6 +76,7 @@ struct sysfs_buffer {
*/
static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
{
+ struct sysfs_dirent * sd = dentry->d_fsdata;
struct attribute * attr = to_attr(dentry);
struct kobject * kobj = to_kobj(dentry->d_parent);
struct sysfs_ops * ops = buffer->ops;
@@ -85,6 +88,7 @@ static int fill_read_buffer(struct dentr
if (!buffer->page)
return -ENOMEM;

+ buffer->event = atomic_read(&sd->s_event);
count = ops->show(kobj,attr,buffer->page);
buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)PAGE_SIZE);
@@ -357,12 +361,55 @@ static int sysfs_release(struct inode *
return 0;
}

+/* Sysfs attribute files are pollable. The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change. When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, as simply seeking and reading
+ * again will not get new data, or reset the state of 'poll'.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll.
+ */
+static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
+{
+ struct sysfs_buffer * buffer = filp->private_data;
+ struct kobject * kobj = to_kobj(filp->f_dentry->d_parent);
+ struct sysfs_dirent * sd = filp->f_dentry->d_fsdata;
+ int res = 0;
+
+ poll_wait(filp, &kobj->poll, wait);
+
+ if (buffer->event != atomic_read(&sd->s_event))
+ res = POLLERR|POLLPRI;
+
+ return res;
+}
+
+void sysfs_notify(struct kobject * k, char *dir, char *attr)
+{
+ struct sysfs_dirent * sd = k->dentry->d_fsdata;
+ if (sd && dir)
+ sd = sysfs_find(sd, dir);
+ if (sd && attr)
+ sd = sysfs_find(sd, attr);
+ if (sd) {
+ atomic_inc(&sd->s_event);
+ wake_up_interruptible(&k->poll);
+ }
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
+
struct file_operations sysfs_file_operations = {
.read = sysfs_read_file,
.write = sysfs_write_file,
.llseek = generic_file_llseek,
.open = sysfs_open_file,
.release = sysfs_release,
+ .poll = sysfs_poll,
};

diff ./fs/sysfs/inode.c~current~ ./fs/sysfs/inode.c
--- ./fs/sysfs/inode.c~current~ 2005-12-21 09:43:51.000000000 +1100
+++ ./fs/sysfs/inode.c 2005-12-21 09:43:52.000000000 +1100
@@ -247,3 +247,23 @@ void sysfs_hash_and_remove(struct dentry
}

+struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name)
+{
+ struct sysfs_dirent * sd, * rv = NULL;
+
+ if (dir->s_dentry == NULL ||
+ dir->s_dentry->d_inode == NULL)
+ return NULL;
+
+ down(&dir->s_dentry->d_inode->i_sem);
+ list_for_each_entry(sd, &dir->s_children, s_sibling) {
+ if (!sd->s_element)
+ continue;
+ if (!strcmp(sysfs_get_name(sd), name)) {
+ rv = sd;
+ break;
+ }
+ }
+ up(&dir->s_dentry->d_inode->i_sem);
+ return rv;
+}

diff ./fs/sysfs/sysfs.h~current~ ./fs/sysfs/sysfs.h
--- ./fs/sysfs/sysfs.h~current~ 2005-12-21 09:43:51.000000000 +1100
+++ ./fs/sysfs/sysfs.h 2005-12-21 09:43:52.000000000 +1100
@@ -10,6 +10,7 @@ extern int sysfs_make_dirent(struct sysf

extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
extern void sysfs_hash_and_remove(struct dentry * dir, const char * name);
+extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);

extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
extern void sysfs_remove_subdir(struct dentry *);

diff ./include/linux/kobject.h~current~ ./include/linux/kobject.h
--- ./include/linux/kobject.h~current~ 2005-12-21 09:43:51.000000000 +1100
+++ ./include/linux/kobject.h 2005-12-21 09:43:52.000000000 +1100
@@ -24,6 +24,7 @@
#include <linux/rwsem.h>
#include <linux/kref.h>
#include <linux/kernel.h>
+#include <linux/wait.h>
#include <asm/atomic.h>

#define KOBJ_NAME_LEN 20
@@ -54,6 +55,7 @@ struct kobject {
struct kset * kset;
struct kobj_type * ktype;
struct dentry * dentry;
+ wait_queue_head_t poll;
};

extern int kobject_set_name(struct kobject *, const char *, ...)

diff ./include/linux/sysfs.h~current~ ./include/linux/sysfs.h
--- ./include/linux/sysfs.h~current~ 2005-12-21 09:43:52.000000000 +1100
+++ ./include/linux/sysfs.h 2005-12-21 09:43:52.000000000 +1100
@@ -74,6 +74,7 @@ struct sysfs_dirent {
umode_t s_mode;
struct dentry * s_dentry;
struct iattr * s_iattr;
+ atomic_t s_event;
};

#define SYSFS_ROOT 0x0001
@@ -118,6 +119,7 @@ int sysfs_remove_bin_file(struct kobject
int sysfs_create_group(struct kobject *, const struct attribute_group *);
void sysfs_remove_group(struct kobject *, const struct attribute_group *);

+void sysfs_notify(struct kobject * k, char *dir, char *attr);
#else /* CONFIG_SYSFS */

static inline int sysfs_create_dir(struct kobject * k)
@@ -185,6 +187,11 @@ static inline void sysfs_remove_group(st
;
}

+static inline void sysfs_notify(struct kobject * k, char *dir, char *attr)
+{
+ ;
+}
+
#endif /* CONFIG_SYSFS */

#endif /* _SYSFS_H_ */

diff ./lib/kobject.c~current~ ./lib/kobject.c
--- ./lib/kobject.c~current~ 2005-12-21 09:43:52.000000000 +1100
+++ ./lib/kobject.c 2005-12-21 09:43:52.000000000 +1100
@@ -124,6 +124,7 @@ void kobject_init(struct kobject * kobj)
{
kref_init(&kobj->kref);
INIT_LIST_HEAD(&kobj->entry);
+ init_waitqueue_head(&kobj->poll);
kobj->kset = kset_get(kobj->kset);
}

2005-12-20 23:28:53

by Jesper Juhl

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On 12/21/05, Neil Brown <[email protected]> wrote:
>
> I suggested an early of this patch some time ago to see if it was an
> acceptable approach and got zero feedback, which presumably means it
> is perfect:-)
>
> I've now reviewed it, fixed up the bits I didn't like, and tested it.
> It works and I am happy with in.
>
> So: I would like to submit it for inclusion in a future kernel.
>
> Comments, or acks, please :-)
>
[snip]
> +/* Sysfs attribute files are pollable. The idea is that you read
> + * the content and then you use 'poll' or 'select' to wait for
> + * the content to change. When the content changes (assuming the
> + * manager for the kobject supports notification), poll will
> + * return POLLERR|POLLPRI, and select will return the fd whether
> + * it is waiting for read, write, or exceptions.
> + * Once poll/select indicates that the value has changed, you
> + * need to close and re-open the file, as simply seeking and reading
> + * again will not get new data, or reset the state of 'poll'.

What if the value changes again between me closing and re-opening the file?

> + * Reminder: this only works for attributes which actively support
> + * it, and it is not possible to test an attribute from userspace
> + * to see if it supports poll.
> + */

--
Jesper Juhl <[email protected]>
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please http://www.expita.com/nomime.html

2005-12-21 00:45:50

by NeilBrown

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Wednesday December 21, [email protected] wrote:
> On 12/21/05, Neil Brown <[email protected]> wrote:
> >
> > I suggested an early of this patch some time ago to see if it was an
> > acceptable approach and got zero feedback, which presumably means it
> > is perfect:-)
> >
> > I've now reviewed it, fixed up the bits I didn't like, and tested it.
> > It works and I am happy with in.
> >
> > So: I would like to submit it for inclusion in a future kernel.
> >
> > Comments, or acks, please :-)
> >
> [snip]
> > +/* Sysfs attribute files are pollable. The idea is that you read
> > + * the content and then you use 'poll' or 'select' to wait for
> > + * the content to change. When the content changes (assuming the
> > + * manager for the kobject supports notification), poll will
> > + * return POLLERR|POLLPRI, and select will return the fd whether
> > + * it is waiting for read, write, or exceptions.
> > + * Once poll/select indicates that the value has changed, you
> > + * need to close and re-open the file, as simply seeking and reading
> > + * again will not get new data, or reset the state of 'poll'.
>
> What if the value changes again between me closing and re-opening the file?

You miss an intermediate event I guess.

If you have a stream of events where you absolutely want to see every
one of them, then you want something like a character-device (or one
of several other alternative).
That is not what this is for.

However very often you don't need to see every single event. You just
need to know when state has changed so you can respond to the new
state.
That is what this patch is for.

Does that answer your question?

Thanks,
NeilBrown

2005-12-21 08:46:40

by Jesper Juhl

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On 12/21/05, Neil Brown <[email protected]> wrote:
> On Wednesday December 21, [email protected] wrote:
> > On 12/21/05, Neil Brown <[email protected]> wrote:
> > >
> > > I suggested an early of this patch some time ago to see if it was an
> > > acceptable approach and got zero feedback, which presumably means it
> > > is perfect:-)
> > >
> > > I've now reviewed it, fixed up the bits I didn't like, and tested it.
> > > It works and I am happy with in.
> > >
> > > So: I would like to submit it for inclusion in a future kernel.
> > >
> > > Comments, or acks, please :-)
> > >
> > [snip]
> > > +/* Sysfs attribute files are pollable. The idea is that you read
> > > + * the content and then you use 'poll' or 'select' to wait for
> > > + * the content to change. When the content changes (assuming the
> > > + * manager for the kobject supports notification), poll will
> > > + * return POLLERR|POLLPRI, and select will return the fd whether
> > > + * it is waiting for read, write, or exceptions.
> > > + * Once poll/select indicates that the value has changed, you
> > > + * need to close and re-open the file, as simply seeking and reading
> > > + * again will not get new data, or reset the state of 'poll'.
> >
> > What if the value changes again between me closing and re-opening the file?
>
> You miss an intermediate event I guess.
>
> If you have a stream of events where you absolutely want to see every
> one of them, then you want something like a character-device (or one
> of several other alternative).
> That is not what this is for.
>
> However very often you don't need to see every single event. You just
> need to know when state has changed so you can respond to the new
> state.
> That is what this patch is for.
>
> Does that answer your question?
>
Yes it does. Thanks.

--
Jesper Juhl <[email protected]>
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please http://www.expita.com/nomime.html

2005-12-21 13:51:54

by Maneesh Soni

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Wed, Dec 21, 2005 at 10:14:29AM +1100, Neil Brown wrote:
>
> I suggested an early of this patch some time ago to see if it was an
> acceptable approach and got zero feedback, which presumably means it
> is perfect:-)
>
> I've now reviewed it, fixed up the bits I didn't like, and tested it.
> It works and I am happy with in.
>
> So: I would like to submit it for inclusion in a future kernel.
>
> Comments, or acks, please :-)
>
> NeilBrown
>
>
> ---------
> This allows an attribute file in sysfs to be polled for activity.
>
> It works like this:
> Open the file
> Read all the contents.
> Call poll requesting POLLERR or POLLPRI (so select/exceptfds works)
> When poll returns, close the file, and go to top of loop.
>

I am no "poll/select" expert, but is reading the contents always a
requirement for "poll"? If not then probably it is not a good idea to
put such rules.

> Events are signaled by an object manager calling
> sysfs_notify(kobj, dir, attr);
>
> If the dir is non-NULL, it is used to find a subdirectory which
> contains the attribute (presumably created by sysfs_create_group).
>
> This has a cost of one int per attribute, one wait_queuehead per kobject,
> one int per open file.
>
So, all the attribute files for a given kobject will use the same
wait queue? What happens if there are multiple attribute files
polled for the same kobject.

> The name "sysfs_notify" may be confused with the inotify
> functionality. Maybe it would be nice to support inotify for sysfs
> attributes as well?
>
> This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action
> to be pollable
>
> Signed-off-by: Neil Brown <[email protected]>
>
> ### Diffstat output
> ./drivers/md/md.c | 1
> ./fs/sysfs/dir.c | 1
> ./fs/sysfs/file.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
> ./fs/sysfs/inode.c | 20 +++++++++++++++++++
> ./fs/sysfs/sysfs.h | 1
> ./include/linux/kobject.h | 2 +
> ./include/linux/sysfs.h | 7 ++++++
> ./lib/kobject.c | 1
> 8 files changed, 80 insertions(+)
>
> diff ./drivers/md/md.c~current~ ./drivers/md/md.c
> --- ./drivers/md/md.c~current~ 2005-12-21 09:42:07.000000000 +1100
> +++ ./drivers/md/md.c 2005-12-21 09:43:52.000000000 +1100
> @@ -162,6 +162,7 @@ void md_new_event(mddev_t *mddev)
> {
> atomic_inc(&md_event_count);
> wake_up(&md_event_waiters);
> + sysfs_notify(&mddev->kobj, NULL, "sync_action");
> }
>
> /*
>
> diff ./fs/sysfs/dir.c~current~ ./fs/sysfs/dir.c
> --- ./fs/sysfs/dir.c~current~ 2005-12-21 09:43:51.000000000 +1100
> +++ ./fs/sysfs/dir.c 2005-12-21 09:43:52.000000000 +1100
> @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_d
>
> memset(sd, 0, sizeof(*sd));
> atomic_set(&sd->s_count, 1);
> + atomic_set(&sd->s_event, 0);
> INIT_LIST_HEAD(&sd->s_children);
> list_add(&sd->s_sibling, &parent_sd->s_children);
> sd->s_element = element;
>
> diff ./fs/sysfs/file.c~current~ ./fs/sysfs/file.c
> --- ./fs/sysfs/file.c~current~ 2005-12-21 09:43:51.000000000 +1100
> +++ ./fs/sysfs/file.c 2005-12-21 09:44:28.000000000 +1100
> @@ -7,6 +7,7 @@
> #include <linux/kobject.h>
> #include <linux/namei.h>
> #include <linux/limits.h>
> +#include <linux/poll.h>
>
> #include <asm/uaccess.h>
> #include <asm/semaphore.h>
> @@ -59,6 +60,7 @@ struct sysfs_buffer {
> struct sysfs_ops * ops;
> struct semaphore sem;
> int needs_read_fill;
> + int event;
> };
>
>
> @@ -74,6 +76,7 @@ struct sysfs_buffer {
> */
> static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
> {
> + struct sysfs_dirent * sd = dentry->d_fsdata;
> struct attribute * attr = to_attr(dentry);
> struct kobject * kobj = to_kobj(dentry->d_parent);
> struct sysfs_ops * ops = buffer->ops;
> @@ -85,6 +88,7 @@ static int fill_read_buffer(struct dentr
> if (!buffer->page)
> return -ENOMEM;
>
> + buffer->event = atomic_read(&sd->s_event);
> count = ops->show(kobj,attr,buffer->page);
> buffer->needs_read_fill = 0;
> BUG_ON(count > (ssize_t)PAGE_SIZE);
> @@ -357,12 +361,55 @@ static int sysfs_release(struct inode *
> return 0;
> }
>
> +/* Sysfs attribute files are pollable. The idea is that you read
> + * the content and then you use 'poll' or 'select' to wait for
> + * the content to change. When the content changes (assuming the
> + * manager for the kobject supports notification), poll will
> + * return POLLERR|POLLPRI, and select will return the fd whether
> + * it is waiting for read, write, or exceptions.
> + * Once poll/select indicates that the value has changed, you
> + * need to close and re-open the file, as simply seeking and reading
> + * again will not get new data, or reset the state of 'poll'.
> + * Reminder: this only works for attributes which actively support
> + * it, and it is not possible to test an attribute from userspace
> + * to see if it supports poll.
> + */
> +static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
> +{
> + struct sysfs_buffer * buffer = filp->private_data;
> + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent);
> + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata;
> + int res = 0;
> +
> + poll_wait(filp, &kobj->poll, wait);
> +
> + if (buffer->event != atomic_read(&sd->s_event))
> + res = POLLERR|POLLPRI;
> +
> + return res;
> +}
> +
> +void sysfs_notify(struct kobject * k, char *dir, char *attr)
> +{
> + struct sysfs_dirent * sd = k->dentry->d_fsdata;
> + if (sd && dir)
> + sd = sysfs_find(sd, dir);
> + if (sd && attr)
> + sd = sysfs_find(sd, attr);
> + if (sd) {
> + atomic_inc(&sd->s_event);
> + wake_up_interruptible(&k->poll);
> + }
> +}
> +EXPORT_SYMBOL_GPL(sysfs_notify);
> +
> struct file_operations sysfs_file_operations = {
> .read = sysfs_read_file,
> .write = sysfs_write_file,
> .llseek = generic_file_llseek,
> .open = sysfs_open_file,
> .release = sysfs_release,
> + .poll = sysfs_poll,
> };
>
>
>
> diff ./fs/sysfs/inode.c~current~ ./fs/sysfs/inode.c
> --- ./fs/sysfs/inode.c~current~ 2005-12-21 09:43:51.000000000 +1100
> +++ ./fs/sysfs/inode.c 2005-12-21 09:43:52.000000000 +1100
> @@ -247,3 +247,23 @@ void sysfs_hash_and_remove(struct dentry
> }
>
>
> +struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name)
> +{
> + struct sysfs_dirent * sd, * rv = NULL;
> +
> + if (dir->s_dentry == NULL ||
> + dir->s_dentry->d_inode == NULL)
> + return NULL;
> +
> + down(&dir->s_dentry->d_inode->i_sem);
> + list_for_each_entry(sd, &dir->s_children, s_sibling) {
> + if (!sd->s_element)
> + continue;
> + if (!strcmp(sysfs_get_name(sd), name)) {
> + rv = sd;
> + break;
> + }
> + }
> + up(&dir->s_dentry->d_inode->i_sem);
> + return rv;
> +}

I think there might be some issues here, if some other thread wants to
remove the kobject, while this thread is trying to notify. Testing
with some parallel add/delete operations should tell.

One can pin the sysfs_dirent corresponding to the attribute file
by doing sysfs_get() but that might create/need more complication
related to parent sysfs_dirent and so on. Because as of now,
sysfs_dirents have 0, 1 or 2 as ref count. At the time of creation
it is 1, and at the time of linking with dentry it is 2. Once it
becomes 0, the sysfs_dirent is freed. This was just to keep the
refcounting simple and use the already available VFS dentry refcounting.

In this case IMO, the better option is to do just lookup_one_len(), get
the dentry and then get the sysfs_dirent as dentry->d_fsdata. And once
done, do dput() which will release the references taken.

I think its time to queue a sysfs locking document in my todo list ;-)

>
> diff ./fs/sysfs/sysfs.h~current~ ./fs/sysfs/sysfs.h
> --- ./fs/sysfs/sysfs.h~current~ 2005-12-21 09:43:51.000000000 +1100
> +++ ./fs/sysfs/sysfs.h 2005-12-21 09:43:52.000000000 +1100
> @@ -10,6 +10,7 @@ extern int sysfs_make_dirent(struct sysf
>
> extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
> extern void sysfs_hash_and_remove(struct dentry * dir, const char * name);
> +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
>
> extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
> extern void sysfs_remove_subdir(struct dentry *);
>
> diff ./include/linux/kobject.h~current~ ./include/linux/kobject.h
> --- ./include/linux/kobject.h~current~ 2005-12-21 09:43:51.000000000 +1100
> +++ ./include/linux/kobject.h 2005-12-21 09:43:52.000000000 +1100
> @@ -24,6 +24,7 @@
> #include <linux/rwsem.h>
> #include <linux/kref.h>
> #include <linux/kernel.h>
> +#include <linux/wait.h>
> #include <asm/atomic.h>
>
> #define KOBJ_NAME_LEN 20
> @@ -54,6 +55,7 @@ struct kobject {
> struct kset * kset;
> struct kobj_type * ktype;
> struct dentry * dentry;
> + wait_queue_head_t poll;
> };

>
> extern int kobject_set_name(struct kobject *, const char *, ...)
>
> diff ./include/linux/sysfs.h~current~ ./include/linux/sysfs.h
> --- ./include/linux/sysfs.h~current~ 2005-12-21 09:43:52.000000000 +1100
> +++ ./include/linux/sysfs.h 2005-12-21 09:43:52.000000000 +1100
> @@ -74,6 +74,7 @@ struct sysfs_dirent {
> umode_t s_mode;
> struct dentry * s_dentry;
> struct iattr * s_iattr;
> + atomic_t s_event;
> };
>
> #define SYSFS_ROOT 0x0001
> @@ -118,6 +119,7 @@ int sysfs_remove_bin_file(struct kobject
> int sysfs_create_group(struct kobject *, const struct attribute_group *);
> void sysfs_remove_group(struct kobject *, const struct attribute_group *);
>
> +void sysfs_notify(struct kobject * k, char *dir, char *attr);
> #else /* CONFIG_SYSFS */
>
> static inline int sysfs_create_dir(struct kobject * k)
> @@ -185,6 +187,11 @@ static inline void sysfs_remove_group(st
> ;
> }
>
> +static inline void sysfs_notify(struct kobject * k, char *dir, char *attr)
> +{
> + ;
> +}
> +
> #endif /* CONFIG_SYSFS */
>
> #endif /* _SYSFS_H_ */
>
> diff ./lib/kobject.c~current~ ./lib/kobject.c
> --- ./lib/kobject.c~current~ 2005-12-21 09:43:52.000000000 +1100
> +++ ./lib/kobject.c 2005-12-21 09:43:52.000000000 +1100
> @@ -124,6 +124,7 @@ void kobject_init(struct kobject * kobj)
> {
> kref_init(&kobj->kref);
> INIT_LIST_HEAD(&kobj->entry);
> + init_waitqueue_head(&kobj->poll);
> kobj->kset = kset_get(kobj->kset);
> }
>
>

--
Maneesh Soni
Linux Technology Center,
IBM India Software Labs,
Bangalore, India
email: [email protected]
Phone: 91-80-25044990

2005-12-22 03:43:27

by NeilBrown

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Wednesday December 21, [email protected] wrote:
> >
> > It works like this:
> > Open the file
> > Read all the contents.
> > Call poll requesting POLLERR or POLLPRI (so select/exceptfds works)
> > When poll returns, close the file, and go to top of loop.
> >
>
> I am no "poll/select" expert, but is reading the contents always a
> requirement for "poll"? If not then probably it is not a good idea to
> put such rules.

You don't have to read the contents unless you want to know what is in
the file. You could just open the file and call 'poll' and wait for
it to tell you something has happened. However this isn't likely to
be really useful.
It isn't the 'something has happened' event that is particularly
interesting. It is the 'the state is now X' information that is
interesting.
So you read the file to find out what the state is. If that isn't the
state you were looking for (or if you have finished responding to that
state), you poll/select, and then try again.

>
> > Events are signaled by an object manager calling
> > sysfs_notify(kobj, dir, attr);
> >
> > If the dir is non-NULL, it is used to find a subdirectory which
> > contains the attribute (presumably created by sysfs_create_group).
> >
> > This has a cost of one int per attribute, one wait_queuehead per kobject,
> > one int per open file.
> >
> So, all the attribute files for a given kobject will use the same
> wait queue? What happens if there are multiple attribute files
> polled for the same kobject.

wait_queues are sharable. Having one wait queue for a number of
events can work quite well.

Suppose a kobject has two (or more) attributes, and two processes poll on
those attributes, one each.
When either attribute gets changed and sysfs_notify is called, both
of the processes will be woken up. They will check if their
attribute of interest has changed. If it has, the poll syscall will
return to userspace. If it hasn't the process will just go to sleep
again.

So, if a kobject has many attributes, and there are many concurrent
processes listening on different attributes, then sharing a wait_queue
may cause a lot of needly wakeup/return-to-sleep events. But it is
fairly unlikely.
wait_queues are not tiny, and having one per kobject is more
economical than one per attribute.

I hope that helps.

> > diff ./fs/sysfs/inode.c~current~ ./fs/sysfs/inode.c
> > --- ./fs/sysfs/inode.c~current~ 2005-12-21 09:43:51.000000000 +1100
> > +++ ./fs/sysfs/inode.c 2005-12-21 09:43:52.000000000 +1100
> > @@ -247,3 +247,23 @@ void sysfs_hash_and_remove(struct dentry
> > }
> >
> >
> > +struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name)
> > +{
> > + struct sysfs_dirent * sd, * rv = NULL;
> > +
> > + if (dir->s_dentry == NULL ||
> > + dir->s_dentry->d_inode == NULL)
> > + return NULL;
> > +
> > + down(&dir->s_dentry->d_inode->i_sem);
> > + list_for_each_entry(sd, &dir->s_children, s_sibling) {
> > + if (!sd->s_element)
> > + continue;
> > + if (!strcmp(sysfs_get_name(sd), name)) {
> > + rv = sd;
> > + break;
> > + }
> > + }
> > + up(&dir->s_dentry->d_inode->i_sem);
> > + return rv;
> > +}
>
> I think there might be some issues here, if some other thread wants to
> remove the kobject, while this thread is trying to notify. Testing
> with some parallel add/delete operations should tell.

My idea - which I thought I had stuck in a comment, but apparently not
- is that the owner of the kobject should have it's own locking to
ensure that it doesn't go away while the sysfs_notify is being
called (md does in the places where it calls sysfs_notify).
I guess it makes sense to make sysfs_notify safe against object owners
doing silly things, but it wasn't clear to me either how, or that it
was necessary.
>
> In this case IMO, the better option is to do just lookup_one_len(), get
> the dentry and then get the sysfs_dirent as dentry->d_fsdata. And once
> done, do dput() which will release the references taken.

You may well be right. I'll try and see what happens.

>
> I think its time to queue a sysfs locking document in my todo list ;-)
>
That would be nice :-)

Thanks for your input.

NeilBrown

2005-12-22 05:50:29

by Maneesh Soni

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Thu, Dec 22, 2005 at 02:43:12PM +1100, Neil Brown wrote:
> On Wednesday December 21, [email protected] wrote:
> > >
> > > It works like this:
> > > Open the file
> > > Read all the contents.
> > > Call poll requesting POLLERR or POLLPRI (so select/exceptfds works)
> > > When poll returns, close the file, and go to top of loop.
> > >
> >
> > I am no "poll/select" expert, but is reading the contents always a
> > requirement for "poll"? If not then probably it is not a good idea to
> > put such rules.
>
> You don't have to read the contents unless you want to know what is in
> the file. You could just open the file and call 'poll' and wait for
> it to tell you something has happened. However this isn't likely to
> be really useful.
> It isn't the 'something has happened' event that is particularly
> interesting. It is the 'the state is now X' information that is
> interesting.
> So you read the file to find out what the state is. If that isn't the
> state you were looking for (or if you have finished responding to that
> state), you poll/select, and then try again.
>

ok.. that makes sense. But in this case [open() and then poll()], should
buffer->event() be initialized in sysfs_open()-->check_perm(), instead
of fill_read_buffer() ? I think this scheme should work for [open(), read()
and then poll()] also.

But how about the other rule, ie once woken-up the user has to close,
re-open and re-read the file. Can this also be avoided, as probably this is also
not poll semantics?

> >
> > > Events are signaled by an object manager calling
> > > sysfs_notify(kobj, dir, attr);
> > >
> > > If the dir is non-NULL, it is used to find a subdirectory which
> > > contains the attribute (presumably created by sysfs_create_group).
> > >
> > > This has a cost of one int per attribute, one wait_queuehead per kobject,
> > > one int per open file.
> > >
> > So, all the attribute files for a given kobject will use the same
> > wait queue? What happens if there are multiple attribute files
> > polled for the same kobject.
>
> wait_queues are sharable. Having one wait queue for a number of
> events can work quite well.
>
> Suppose a kobject has two (or more) attributes, and two processes poll on
> those attributes, one each.
> When either attribute gets changed and sysfs_notify is called, both
> of the processes will be woken up. They will check if their
> attribute of interest has changed. If it has, the poll syscall will
> return to userspace. If it hasn't the process will just go to sleep
> again.
>
> So, if a kobject has many attributes, and there are many concurrent
> processes listening on different attributes, then sharing a wait_queue
> may cause a lot of needly wakeup/return-to-sleep events. But it is
> fairly unlikely.
> wait_queues are not tiny, and having one per kobject is more
> economical than one per attribute.
>
> I hope that helps.
>
> > > diff ./fs/sysfs/inode.c~current~ ./fs/sysfs/inode.c
> > > --- ./fs/sysfs/inode.c~current~ 2005-12-21 09:43:51.000000000 +1100
> > > +++ ./fs/sysfs/inode.c 2005-12-21 09:43:52.000000000 +1100
> > > @@ -247,3 +247,23 @@ void sysfs_hash_and_remove(struct dentry
> > > }
> > >
> > >
> > > +struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name)
> > > +{
> > > + struct sysfs_dirent * sd, * rv = NULL;
> > > +
> > > + if (dir->s_dentry == NULL ||
> > > + dir->s_dentry->d_inode == NULL)
> > > + return NULL;
> > > +
> > > + down(&dir->s_dentry->d_inode->i_sem);
> > > + list_for_each_entry(sd, &dir->s_children, s_sibling) {
> > > + if (!sd->s_element)
> > > + continue;
> > > + if (!strcmp(sysfs_get_name(sd), name)) {
> > > + rv = sd;
> > > + break;
> > > + }
> > > + }
> > > + up(&dir->s_dentry->d_inode->i_sem);
> > > + return rv;
> > > +}
> >
> > I think there might be some issues here, if some other thread wants to
> > remove the kobject, while this thread is trying to notify. Testing
> > with some parallel add/delete operations should tell.
>
> My idea - which I thought I had stuck in a comment, but apparently not
> - is that the owner of the kobject should have it's own locking to
> ensure that it doesn't go away while the sysfs_notify is being
> called (md does in the places where it calls sysfs_notify).
> I guess it makes sense to make sysfs_notify safe against object owners
> doing silly things, but it wasn't clear to me either how, or that it
> was necessary.
> >

That's right, some times these kobjects do strange things like the
kobject is alive but it decides to get rid of attribute files. So, I think
if the VFS/dentry locking is used, it would be safer.

> > In this case IMO, the better option is to do just lookup_one_len(), get
> > the dentry and then get the sysfs_dirent as dentry->d_fsdata. And once
> > done, do dput() which will release the references taken.
>
> You may well be right. I'll try and see what happens.
>

Thanks
Maneesh

--
Maneesh Soni
Linux Technology Center,
IBM India Software Labs,
Bangalore, India
email: [email protected]
Phone: 91-80-25044990

2005-12-22 06:02:13

by NeilBrown

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Thursday December 22, [email protected] wrote:
> On Thu, Dec 22, 2005 at 02:43:12PM +1100, Neil Brown wrote:
> > You don't have to read the contents unless you want to know what is in
> > the file. You could just open the file and call 'poll' and wait for
> > it to tell you something has happened. However this isn't likely to
> > be really useful.
> > It isn't the 'something has happened' event that is particularly
> > interesting. It is the 'the state is now X' information that is
> > interesting.
> > So you read the file to find out what the state is. If that isn't the
> > state you were looking for (or if you have finished responding to that
> > state), you poll/select, and then try again.
> >
>
> ok.. that makes sense. But in this case [open() and then poll()], should
> buffer->event() be initialized in sysfs_open()-->check_perm(), instead
> of fill_read_buffer() ? I think this scheme should work for [open(), read()
> and then poll()] also.

We are definitely using poll in a non-standard way as it is generally
for "you can read now" or "you can write now", and we are (ab)using it
to say "there is new information". Note that this is essentially
copying the semantics of 'poll' on /proc/mounts.

I would see poll returning as meaning "there is state information that
you haven't read".

When you first open the file, you haven't read anything, so poll
should return immediately - which it currently does.
After you read something, poll won't return again until there is
something new to be read.

I think this is probably the best semantics, but if you try hard you
might be able to convince me otherwise...

>
> But how about the other rule, ie once woken-up the user has to close,
> re-open and re-read the file. Can this also be avoided, as probably this is also
> not poll semantics?

This semantic is part of sysfs. The way sysfs currently works, you
open a file, and read it, and that is the only value you see. If you
rewind and read again, you still get the old value, even if it
"should" have changed. The value is cached and the cache is never
refreshed.

sysfs could be changed to flush the cache on rewind, but I don't know
that it is worth it. If it was changed, the poll functionality would
automatically do the right thing.

Thanks again,

NeilBrown

2005-12-22 07:08:07

by Maneesh Soni

[permalink] [raw]

Subject: Re: [PATCH - 2.6.15-rc5-mm3] Allow sysfs attribute files to be pollable.

On Thu, Dec 22, 2005 at 05:02:02PM +1100, Neil Brown wrote:
> On Thursday December 22, [email protected] wrote:
> > On Thu, Dec 22, 2005 at 02:43:12PM +1100, Neil Brown wrote:
> > > You don't have to read the contents unless you want to know what is in
> > > the file. You could just open the file and call 'poll' and wait for
> > > it to tell you something has happened. However this isn't likely to
> > > be really useful.
> > > It isn't the 'something has happened' event that is particularly
> > > interesting. It is the 'the state is now X' information that is
> > > interesting.
> > > So you read the file to find out what the state is. If that isn't the
> > > state you were looking for (or if you have finished responding to that
> > > state), you poll/select, and then try again.
> > >
> >
> > ok.. that makes sense. But in this case [open() and then poll()], should
> > buffer->event() be initialized in sysfs_open()-->check_perm(), instead
> > of fill_read_buffer() ? I think this scheme should work for [open(), read()
> > and then poll()] also.
>
> We are definitely using poll in a non-standard way as it is generally
> for "you can read now" or "you can write now", and we are (ab)using it
> to say "there is new information". Note that this is essentially
> copying the semantics of 'poll' on /proc/mounts.
>
> I would see poll returning as meaning "there is state information that
> you haven't read".
>
> When you first open the file, you haven't read anything, so poll
> should return immediately - which it currently does.

If the current patch already follows the semantics you just
described (ie if polled right after open, return immediately)
then no problem.

> After you read something, poll won't return again until there is
> something new to be read.
>
> I think this is probably the best semantics, but if you try hard you
> might be able to convince me otherwise...
>
> >
> > But how about the other rule, ie once woken-up the user has to close,
> > re-open and re-read the file. Can this also be avoided, as probably this is also
> > not poll semantics?
>
> This semantic is part of sysfs. The way sysfs currently works, you
> open a file, and read it, and that is the only value you see. If you
> rewind and read again, you still get the old value, even if it
> "should" have changed. The value is cached and the cache is never
> refreshed.
>
> sysfs could be changed to flush the cache on rewind, but I don't know
> that it is worth it. If it was changed, the poll functionality would
> automatically do the right thing.
>

IMHO, it is worthy enough if it can allow "poll" to have usual semantics.

Thanks
Maneesh

--
Maneesh Soni
Linux Technology Center,
IBM India Software Labs,
Bangalore, India
email: [email protected]
Phone: 91-80-25044990