2011-06-01 12:14:39

by Lucas De Marchi

[permalink] [raw]
Subject: [PATCH] sysctl: add support for poll()

Adding support for poll() in sysctl fs allows userspace to receive
notifications when an entry in sysctl changes. This way it's possible to
know when hostname/domainname is changed due to the respective syscall
has been called or its file under /proc/sys has been written to.

Signed-off-by: Lucas De Marchi <[email protected]>
---
fs/proc/proc_sysctl.c | 40 ++++++++++++++++++++++++++++++++++++++++
include/linux/sysctl.h | 8 ++++++++
include/linux/utsname.h | 16 ++++++++++++++++
kernel/sys.c | 2 ++
kernel/utsname_sysctl.c | 36 ++++++++++++++++++++++++++++++++++++
5 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f50133c..2e5d3ec 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
*/
#include <linux/init.h>
#include <linux/sysctl.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/namei.h>
@@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
}

+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ if (table->poll) {
+ unsigned long event = atomic_read(&table->poll->event);
+
+ filp->private_data = (void *)event;
+ }
+
+ return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ unsigned long event = (unsigned long)filp->private_data;
+ unsigned int ret = POLLIN | POLLRDNORM;
+
+ if (!table->proc_handler)
+ goto out;
+
+ if (!table->poll)
+ goto out;
+
+ poll_wait(filp, &table->poll->wait, wait);
+
+ if (event != atomic_read(&table->poll->event)) {
+ filp->private_data = (void *)(unsigned long)atomic_read(
+ &table->poll->event);
+ ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+out:
+ return ret;
+}

static int proc_sys_fill_cache(struct file *filp, void *dirent,
filldir_t filldir,
@@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
}

static const struct file_operations proc_sys_file_operations = {
+ .open = proc_sys_open,
+ .poll = proc_sys_poll,
.read = proc_sys_read,
.write = proc_sys_write,
.llseek = default_llseek,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..96c89ba 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -25,6 +25,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/compiler.h>
+#include <linux/wait.h>

struct completion;

@@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
* cover common cases.
*/

+/* Support for userspace poll() to watch for changes */
+struct ctl_table_poll {
+ atomic_t event;
+ wait_queue_head_t wait;
+};
+
/* A sysctl table is an array of struct ctl_table: */
struct ctl_table
{
@@ -1021,6 +1028,7 @@ struct ctl_table
struct ctl_table *child;
struct ctl_table *parent; /* Automatically set */
proc_handler *proc_handler; /* Callback for text formatting */
+ struct ctl_table_poll *poll;
void *extra1;
void *extra2;
};
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 4e5b021..c714ed7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,6 +37,14 @@ struct new_utsname {
#include <linux/nsproxy.h>
#include <linux/err.h>

+enum uts_proc {
+ UTS_PROC_OSTYPE,
+ UTS_PROC_OSRELEASE,
+ UTS_PROC_VERSION,
+ UTS_PROC_HOSTNAME,
+ UTS_PROC_DOMAINNAME,
+};
+
struct user_namespace;
extern struct user_namespace init_user_ns;

@@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
}
#endif

+#ifdef CONFIG_PROC_SYSCTL
+extern void uts_proc_notify(enum uts_proc proc);
+#else
+static inline void uts_proc_notify(enum uts_proc proc)
+{
+}
+#endif
+
static inline struct new_utsname *utsname(void)
{
return &current->nsproxy->uts_ns->name;
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b2..ada9cd7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
}
+ uts_proc_notify(UTS_PROC_HOSTNAME);
up_write(&uts_sem);
return errno;
}
@@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
}
+ uts_proc_notify(UTS_PROC_DOMAINNAME);
up_write(&uts_sem);
return errno;
}
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e..e96b766 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/sysctl.h>
+#include <linux/wait.h>

static void *get_uts(ctl_table *table, int write)
{
@@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
uts_table.data = get_uts(table, write);
r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
put_uts(table, write, uts_table.data);
+
+ if (write) {
+ atomic_inc(&table->poll->event);
+ wake_up_interruptible(&table->poll->wait);
+ }
+
return r;
}
#else
#define proc_do_uts_string NULL
#endif

+static struct ctl_table_poll hostname_poll = {
+ .event = ATOMIC_INIT(0),
+ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
+};
+
+static struct ctl_table_poll domainname_poll = {
+ .event = ATOMIC_INIT(0),
+ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
+};
+
static struct ctl_table uts_kern_table[] = {
{
.procname = "ostype",
@@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.nodename),
.mode = 0644,
.proc_handler = proc_do_uts_string,
+ .poll = &hostname_poll,
},
{
.procname = "domainname",
@@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.domainname),
.mode = 0644,
.proc_handler = proc_do_uts_string,
+ .poll = &domainname_poll,
},
{}
};
@@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
{}
};

+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+ struct ctl_table *table = &uts_kern_table[proc];
+
+ if (!table->poll)
+ return;
+
+ atomic_inc(&table->poll->event);
+ wake_up_interruptible(&table->poll->wait);
+}
+#endif
+
static int __init utsname_sysctl_init(void)
{
register_sysctl_table(uts_root_table);
--
1.7.5.2


2011-06-02 02:52:00

by Lucas De Marchi

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

CC'ing people as suggested by get_maintainer.pl

On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
<[email protected]> wrote:
> Adding support for poll() in sysctl fs allows userspace to receive
> notifications when an entry in sysctl changes. This way it's possible to
> know when hostname/domainname is changed due to the respective syscall
> has been called or its file under /proc/sys has been written to.
>
> Signed-off-by: Lucas De Marchi <[email protected]>
> ---
> ?fs/proc/proc_sysctl.c ? | ? 40 ++++++++++++++++++++++++++++++++++++++++
> ?include/linux/sysctl.h ?| ? ?8 ++++++++
> ?include/linux/utsname.h | ? 16 ++++++++++++++++
> ?kernel/sys.c ? ? ? ? ? ?| ? ?2 ++
> ?kernel/utsname_sysctl.c | ? 36 ++++++++++++++++++++++++++++++++++++
> ?5 files changed, 102 insertions(+), 0 deletions(-)
>
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index f50133c..2e5d3ec 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -3,6 +3,7 @@
> ?*/
> ?#include <linux/init.h>
> ?#include <linux/sysctl.h>
> +#include <linux/poll.h>
> ?#include <linux/proc_fs.h>
> ?#include <linux/security.h>
> ?#include <linux/namei.h>
> @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
> ? ? ? ?return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
> ?}
>
> +static int proc_sys_open(struct inode *inode, struct file *filp)
> +{
> + ? ? ? struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +
> + ? ? ? if (table->poll) {
> + ? ? ? ? ? ? ? unsigned long event = atomic_read(&table->poll->event);
> +
> + ? ? ? ? ? ? ? filp->private_data = (void *)event;
> + ? ? ? }
> +
> + ? ? ? return 0;
> +}
> +
> +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
> +{
> + ? ? ? struct inode *inode = filp->f_path.dentry->d_inode;
> + ? ? ? struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> + ? ? ? unsigned long event = (unsigned long)filp->private_data;
> + ? ? ? unsigned int ret = POLLIN | POLLRDNORM;
> +
> + ? ? ? if (!table->proc_handler)
> + ? ? ? ? ? ? ? goto out;
> +
> + ? ? ? if (!table->poll)
> + ? ? ? ? ? ? ? goto out;
> +
> + ? ? ? poll_wait(filp, &table->poll->wait, wait);
> +
> + ? ? ? if (event != atomic_read(&table->poll->event)) {
> + ? ? ? ? ? ? ? filp->private_data = (void *)(unsigned long)atomic_read(
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &table->poll->event);
> + ? ? ? ? ? ? ? ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
> + ? ? ? }
> +
> +out:
> + ? ? ? return ret;
> +}
>
> ?static int proc_sys_fill_cache(struct file *filp, void *dirent,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?filldir_t filldir,
> @@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
> ?}
>
> ?static const struct file_operations proc_sys_file_operations = {
> + ? ? ? .open ? ? ? ? ? = proc_sys_open,
> + ? ? ? .poll ? ? ? ? ? = proc_sys_poll,
> ? ? ? ?.read ? ? ? ? ? = proc_sys_read,
> ? ? ? ?.write ? ? ? ? ?= proc_sys_write,
> ? ? ? ?.llseek ? ? ? ? = default_llseek,
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 11684d9..96c89ba 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -25,6 +25,7 @@
> ?#include <linux/kernel.h>
> ?#include <linux/types.h>
> ?#include <linux/compiler.h>
> +#include <linux/wait.h>
>
> ?struct completion;
>
> @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
> ?* cover common cases.
> ?*/
>
> +/* Support for userspace poll() to watch for changes */
> +struct ctl_table_poll {
> + ? ? ? atomic_t event;
> + ? ? ? wait_queue_head_t wait;
> +};
> +
> ?/* A sysctl table is an array of struct ctl_table: */
> ?struct ctl_table
> ?{
> @@ -1021,6 +1028,7 @@ struct ctl_table
> ? ? ? ?struct ctl_table *child;
> ? ? ? ?struct ctl_table *parent; ? ? ? /* Automatically set */
> ? ? ? ?proc_handler *proc_handler; ? ? /* Callback for text formatting */
> + ? ? ? struct ctl_table_poll *poll;
> ? ? ? ?void *extra1;
> ? ? ? ?void *extra2;
> ?};
> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
> index 4e5b021..c714ed7 100644
> --- a/include/linux/utsname.h
> +++ b/include/linux/utsname.h
> @@ -37,6 +37,14 @@ struct new_utsname {
> ?#include <linux/nsproxy.h>
> ?#include <linux/err.h>
>
> +enum uts_proc {
> + ? ? ? UTS_PROC_OSTYPE,
> + ? ? ? UTS_PROC_OSRELEASE,
> + ? ? ? UTS_PROC_VERSION,
> + ? ? ? UTS_PROC_HOSTNAME,
> + ? ? ? UTS_PROC_DOMAINNAME,
> +};
> +
> ?struct user_namespace;
> ?extern struct user_namespace init_user_ns;
>
> @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
> ?}
> ?#endif
>
> +#ifdef CONFIG_PROC_SYSCTL
> +extern void uts_proc_notify(enum uts_proc proc);
> +#else
> +static inline void uts_proc_notify(enum uts_proc proc)
> +{
> +}
> +#endif
> +
> ?static inline struct new_utsname *utsname(void)
> ?{
> ? ? ? ?return &current->nsproxy->uts_ns->name;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index e4128b2..ada9cd7 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
> ? ? ? ? ? ? ? ?memset(u->nodename + len, 0, sizeof(u->nodename) - len);
> ? ? ? ? ? ? ? ?errno = 0;
> ? ? ? ?}
> + ? ? ? uts_proc_notify(UTS_PROC_HOSTNAME);
> ? ? ? ?up_write(&uts_sem);
> ? ? ? ?return errno;
> ?}
> @@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
> ? ? ? ? ? ? ? ?memset(u->domainname + len, 0, sizeof(u->domainname) - len);
> ? ? ? ? ? ? ? ?errno = 0;
> ? ? ? ?}
> + ? ? ? uts_proc_notify(UTS_PROC_DOMAINNAME);
> ? ? ? ?up_write(&uts_sem);
> ? ? ? ?return errno;
> ?}
> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
> index a2cd77e..e96b766 100644
> --- a/kernel/utsname_sysctl.c
> +++ b/kernel/utsname_sysctl.c
> @@ -13,6 +13,7 @@
> ?#include <linux/uts.h>
> ?#include <linux/utsname.h>
> ?#include <linux/sysctl.h>
> +#include <linux/wait.h>
>
> ?static void *get_uts(ctl_table *table, int write)
> ?{
> @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
> ? ? ? ?uts_table.data = get_uts(table, write);
> ? ? ? ?r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
> ? ? ? ?put_uts(table, write, uts_table.data);
> +
> + ? ? ? if (write) {
> + ? ? ? ? ? ? ? atomic_inc(&table->poll->event);
> + ? ? ? ? ? ? ? wake_up_interruptible(&table->poll->wait);
> + ? ? ? }
> +
> ? ? ? ?return r;
> ?}
> ?#else
> ?#define proc_do_uts_string NULL
> ?#endif
>
> +static struct ctl_table_poll hostname_poll = {
> + ? ? ? .event ? ? ? ? ?= ATOMIC_INIT(0),
> + ? ? ? .wait ? ? ? ? ? = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
> +};
> +
> +static struct ctl_table_poll domainname_poll = {
> + ? ? ? .event ? ? ? ? ?= ATOMIC_INIT(0),
> + ? ? ? .wait ? ? ? ? ? = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
> +};
> +
> ?static struct ctl_table uts_kern_table[] = {
> ? ? ? ?{
> ? ? ? ? ? ? ? ?.procname ? ? ? = "ostype",
> @@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
> ? ? ? ? ? ? ? ?.maxlen ? ? ? ? = sizeof(init_uts_ns.name.nodename),
> ? ? ? ? ? ? ? ?.mode ? ? ? ? ? = 0644,
> ? ? ? ? ? ? ? ?.proc_handler ? = proc_do_uts_string,
> + ? ? ? ? ? ? ? .poll ? ? ? ? ? = &hostname_poll,
> ? ? ? ?},
> ? ? ? ?{
> ? ? ? ? ? ? ? ?.procname ? ? ? = "domainname",
> @@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
> ? ? ? ? ? ? ? ?.maxlen ? ? ? ? = sizeof(init_uts_ns.name.domainname),
> ? ? ? ? ? ? ? ?.mode ? ? ? ? ? = 0644,
> ? ? ? ? ? ? ? ?.proc_handler ? = proc_do_uts_string,
> + ? ? ? ? ? ? ? .poll ? ? ? ? ? = &domainname_poll,
> ? ? ? ?},
> ? ? ? ?{}
> ?};
> @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
> ? ? ? ?{}
> ?};
>
> +#ifdef CONFIG_PROC_SYSCTL
> +/*
> + * Notify userspace about a change in a certain entry of uts_kern_table,
> + * identified by the parameter proc.
> + */
> +void uts_proc_notify(enum uts_proc proc)
> +{
> + ? ? ? struct ctl_table *table = &uts_kern_table[proc];
> +
> + ? ? ? if (!table->poll)
> + ? ? ? ? ? ? ? return;
> +
> + ? ? ? atomic_inc(&table->poll->event);
> + ? ? ? wake_up_interruptible(&table->poll->wait);
> +}
> +#endif
> +
> ?static int __init utsname_sysctl_init(void)
> ?{
> ? ? ? ?register_sysctl_table(uts_root_table);
> --
> 1.7.5.2
>
>

2011-06-02 03:32:09

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

Lucas De Marchi <[email protected]> writes:

> CC'ing people as suggested by get_maintainer.pl
>
> On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
> <[email protected]> wrote:
>> Adding support for poll() in sysctl fs allows userspace to receive
>> notifications when an entry in sysctl changes. This way it's possible to
>> know when hostname/domainname is changed due to the respective syscall
>> has been called or its file under /proc/sys has been written to.

Why do you want to do this? What advantage does this bring to
userspace?

That feels like a pretty big special case.

Eric


>> Signed-off-by: Lucas De Marchi <[email protected]>
>> ---
>>  fs/proc/proc_sysctl.c   |   40 ++++++++++++++++++++++++++++++++++++++++
>>  include/linux/sysctl.h  |    8 ++++++++
>>  include/linux/utsname.h |   16 ++++++++++++++++
>>  kernel/sys.c            |    2 ++
>>  kernel/utsname_sysctl.c |   36 ++++++++++++++++++++++++++++++++++++
>>  5 files changed, 102 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
>> index f50133c..2e5d3ec 100644
>> --- a/fs/proc/proc_sysctl.c
>> +++ b/fs/proc/proc_sysctl.c
>> @@ -3,6 +3,7 @@
>>  */
>>  #include <linux/init.h>
>>  #include <linux/sysctl.h>
>> +#include <linux/poll.h>
>>  #include <linux/proc_fs.h>
>>  #include <linux/security.h>
>>  #include <linux/namei.h>
>> @@ -176,6 +177,43 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
>>        return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
>>  }
>>
>> +static int proc_sys_open(struct inode *inode, struct file *filp)
>> +{
>> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> +
>> +       if (table->poll) {
>> +               unsigned long event = atomic_read(&table->poll->event);
>> +
>> +               filp->private_data = (void *)event;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
>> +{
>> +       struct inode *inode = filp->f_path.dentry->d_inode;
>> +       struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>> +       unsigned long event = (unsigned long)filp->private_data;
>> +       unsigned int ret = POLLIN | POLLRDNORM;
>> +
>> +       if (!table->proc_handler)
>> +               goto out;
>> +
>> +       if (!table->poll)
>> +               goto out;
>> +
>> +       poll_wait(filp, &table->poll->wait, wait);
>> +
>> +       if (event != atomic_read(&table->poll->event)) {
>> +               filp->private_data = (void *)(unsigned long)atomic_read(
>> +                                               &table->poll->event);
>> +               ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
>> +       }
>> +
>> +out:
>> +       return ret;
>> +}
>>
>>  static int proc_sys_fill_cache(struct file *filp, void *dirent,
>>                                filldir_t filldir,
>> @@ -367,6 +405,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
>>  }
>>
>>  static const struct file_operations proc_sys_file_operations = {
>> +       .open           = proc_sys_open,
>> +       .poll           = proc_sys_poll,
>>        .read           = proc_sys_read,
>>        .write          = proc_sys_write,
>>        .llseek         = default_llseek,
>> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
>> index 11684d9..96c89ba 100644
>> --- a/include/linux/sysctl.h
>> +++ b/include/linux/sysctl.h
>> @@ -25,6 +25,7 @@
>>  #include <linux/kernel.h>
>>  #include <linux/types.h>
>>  #include <linux/compiler.h>
>> +#include <linux/wait.h>
>>
>>  struct completion;
>>
>> @@ -1011,6 +1012,12 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
>>  * cover common cases.
>>  */
>>
>> +/* Support for userspace poll() to watch for changes */
>> +struct ctl_table_poll {
>> +       atomic_t event;
>> +       wait_queue_head_t wait;
>> +};
>> +
>>  /* A sysctl table is an array of struct ctl_table: */
>>  struct ctl_table
>>  {
>> @@ -1021,6 +1028,7 @@ struct ctl_table
>>        struct ctl_table *child;
>>        struct ctl_table *parent;       /* Automatically set */
>>        proc_handler *proc_handler;     /* Callback for text formatting */
>> +       struct ctl_table_poll *poll;
>>        void *extra1;
>>        void *extra2;
>>  };
>> diff --git a/include/linux/utsname.h b/include/linux/utsname.h
>> index 4e5b021..c714ed7 100644
>> --- a/include/linux/utsname.h
>> +++ b/include/linux/utsname.h
>> @@ -37,6 +37,14 @@ struct new_utsname {
>>  #include <linux/nsproxy.h>
>>  #include <linux/err.h>
>>
>> +enum uts_proc {
>> +       UTS_PROC_OSTYPE,
>> +       UTS_PROC_OSRELEASE,
>> +       UTS_PROC_VERSION,
>> +       UTS_PROC_HOSTNAME,
>> +       UTS_PROC_DOMAINNAME,
>> +};
>> +
>>  struct user_namespace;
>>  extern struct user_namespace init_user_ns;
>>
>> @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
>>  }
>>  #endif
>>
>> +#ifdef CONFIG_PROC_SYSCTL
>> +extern void uts_proc_notify(enum uts_proc proc);
>> +#else
>> +static inline void uts_proc_notify(enum uts_proc proc)
>> +{
>> +}
>> +#endif
>> +
>>  static inline struct new_utsname *utsname(void)
>>  {
>>        return &current->nsproxy->uts_ns->name;
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index e4128b2..ada9cd7 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -1211,6 +1211,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
>>                memset(u->nodename + len, 0, sizeof(u->nodename) - len);
>>                errno = 0;
>>        }
>> +       uts_proc_notify(UTS_PROC_HOSTNAME);
>>        up_write(&uts_sem);
>>        return errno;
>>  }
>> @@ -1261,6 +1262,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
>>                memset(u->domainname + len, 0, sizeof(u->domainname) - len);
>>                errno = 0;
>>        }
>> +       uts_proc_notify(UTS_PROC_DOMAINNAME);
>>        up_write(&uts_sem);
>>        return errno;
>>  }
>> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
>> index a2cd77e..e96b766 100644
>> --- a/kernel/utsname_sysctl.c
>> +++ b/kernel/utsname_sysctl.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/uts.h>
>>  #include <linux/utsname.h>
>>  #include <linux/sysctl.h>
>> +#include <linux/wait.h>
>>
>>  static void *get_uts(ctl_table *table, int write)
>>  {
>> @@ -51,12 +52,28 @@ static int proc_do_uts_string(ctl_table *table, int write,
>>        uts_table.data = get_uts(table, write);
>>        r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
>>        put_uts(table, write, uts_table.data);
>> +
>> +       if (write) {
>> +               atomic_inc(&table->poll->event);
>> +               wake_up_interruptible(&table->poll->wait);
>> +       }
>> +
>>        return r;
>>  }
>>  #else
>>  #define proc_do_uts_string NULL
>>  #endif
>>
>> +static struct ctl_table_poll hostname_poll = {
>> +       .event          = ATOMIC_INIT(0),
>> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(hostname_poll.wait),
>> +};
>> +
>> +static struct ctl_table_poll domainname_poll = {
>> +       .event          = ATOMIC_INIT(0),
>> +       .wait           = __WAIT_QUEUE_HEAD_INITIALIZER(domainname_poll.wait),
>> +};
>> +
>>  static struct ctl_table uts_kern_table[] = {
>>        {
>>                .procname       = "ostype",
>> @@ -85,6 +102,7 @@ static struct ctl_table uts_kern_table[] = {
>>                .maxlen         = sizeof(init_uts_ns.name.nodename),
>>                .mode           = 0644,
>>                .proc_handler   = proc_do_uts_string,
>> +               .poll           = &hostname_poll,
>>        },
>>        {
>>                .procname       = "domainname",
>> @@ -92,6 +110,7 @@ static struct ctl_table uts_kern_table[] = {
>>                .maxlen         = sizeof(init_uts_ns.name.domainname),
>>                .mode           = 0644,
>>                .proc_handler   = proc_do_uts_string,
>> +               .poll           = &domainname_poll,
>>        },
>>        {}
>>  };
>> @@ -105,6 +124,23 @@ static struct ctl_table uts_root_table[] = {
>>        {}
>>  };
>>
>> +#ifdef CONFIG_PROC_SYSCTL
>> +/*
>> + * Notify userspace about a change in a certain entry of uts_kern_table,
>> + * identified by the parameter proc.
>> + */
>> +void uts_proc_notify(enum uts_proc proc)
>> +{
>> +       struct ctl_table *table = &uts_kern_table[proc];
>> +
>> +       if (!table->poll)
>> +               return;
>> +
>> +       atomic_inc(&table->poll->event);
>> +       wake_up_interruptible(&table->poll->wait);
>> +}
>> +#endif
>> +
>>  static int __init utsname_sysctl_init(void)
>>  {
>>        register_sysctl_table(uts_root_table);
>> --
>> 1.7.5.2
>>
>>

2011-06-02 12:06:47

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 2, 2011 at 05:31, Eric W. Biederman <[email protected]> wrote:
> Lucas De Marchi <[email protected]> writes:
>
>> CC'ing people as suggested by get_maintainer.pl
>>
>> On Wed, Jun 1, 2011 at 9:14 AM, Lucas De Marchi
>> <[email protected]> wrote:
>>> Adding support for poll() in sysctl fs allows userspace to receive
>>> notifications when an entry in sysctl changes. This way it's possible to
>>> know when hostname/domainname is changed due to the respective syscall
>>> has been called or its file under /proc/sys has been written to.
>
> Why do you want to do this?  What advantage does this bring to
> userspace?

Host names are dynamic, can change during system runtime by dhcp or
similar setups, or just get changed by the user.

System management tools/services needs to track these changes to
propagate the actual host name into the running services. It's the
same like we dynamically track /proc/mounts, /proc/swaps, or any other
stuff that can change underneath us, and forces other things to adapt
to the changing environment.

> That feels like a pretty big special case.

The alternative is to have a process constantly polling and reading
the file, which is nothing we even want to think about in 2011.

It's just another special case to bring us out of the UNIX stone age
of doing things. :)

Kay

2011-06-02 12:43:16

by Alan

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

> Host names are dynamic, can change during system runtime by dhcp or
> similar setups, or just get changed by the user.

I don't actually see what this has to do with utsname. uname historically
defined nodename as "name within an implementation-defined communications
network" and actually tended to be the UUCP name. Modern SuS says "`the
name of the node of the communications network to which this node is
attached, if any"

The latter unfortunately makes no sense anyway and is a fine example of
standards body cluelessness as name mapping on IP networks is not one
name per host, and also because the standard doesn't require the fields
in the struct are long enough to hold a DNS name!

(Indeed in its usual head up backside manner its technically valid to
define

char nodename[1];

and have only \0 as a valid reply)

Realistically all the naming policy has to be in userspace, and that
means that all the userspace bits have to agree on what they use utsname
for (or indeed if they are wise not use nodename in the first place) so
they can handle name management outside of kernel.

In the DHCP case for example a node which is acquiring a DHCP address on
wireless and on ethernet at the same time (very typical) has to make sure
all its DHCP and other address management policies agree rather than have
the two have a kernel food fight over the name to use.

So this all belongs in conman, network manager and other similar
systems and I'd suggest they try and deal with the meaningful IP world
questions of "what list of names do I posess for purpose X on network Y"
and "is this one of my names on ..."

> The alternative is to have a process constantly polling and reading
> the file, which is nothing we even want to think about in 2011.

Or to manage it properly.

> It's just another special case to bring us out of the UNIX stone age
> of doing things. :)

Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
short name' policy. Forget utsname node names, they are a historical
quirk of UUCP and friends and on many OS platforms will be limited to 15
chars !

As to poll in general I can see some of the other proc files being
more relevant, eg for process monitoring tools being able to poll
in /proc/<pid> and some of the proc/sys and sysctl data that does change
meaningfully. Utsname however is not one of those things.


Alan

2011-06-02 13:02:12

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 2, 2011 at 14:43, Alan Cox <[email protected]> wrote:
> So this all belongs in conman, network manager and other similar
> systems and I'd suggest they try and deal with the meaningful IP world
> questions of "what list of names do I posess for purpose X on network Y"
> and "is this one of my names on ..."

Higher level interfaces already live in the system services to set and
get the names in various ways. But we also need to support changes by
the old and simple tools. Calling hostname(1) must work just like
everything else. It changes state in the kernel, we need to know it
and adapt to it.

>> The alternative is to have a process constantly polling and reading
>> the file, which is nothing we even want to think about in 2011.
>
> Or to manage it properly.

That's just a dream. We do it properly, but we have no control over
everything. You've been yourself in that position, and have seen that
some stuff will 'never' change, we better just support it instead of
hoping it will not be used. Especially when it's that damn simple and
clean.

>> It's just another special case to bring us out of the UNIX stone age
>> of doing things. :)
>
> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
> short name' policy. Forget utsname node names, they are a historical
> quirk of UUCP and friends and on many OS platforms will be limited to 15
> chars !

That's not a problem. We support pretty names in the higher stack just
fine, but there is a tone of stuff that depends on gethostname(),
sethostname() and we have to support that.

> As to poll in general I can see some of the other proc files being
> more relevant, eg for process monitoring tools being able to poll
> in /proc/<pid> and some of the proc/sys and sysctl data that does change
> meaningfully.

Yeah, being able to watch a pid with such a simple interface would be
great to have.

> Utsname however is not one of those things.

I think it is.

As always, what are the alternatives you would propose, to watch
sethostname() changes to the system. And 'it should not be used', or
'it should be centrally intercepted' does not count, the reality will
go on without such rules.

Kay

2011-06-02 13:03:12

by Lucas De Marchi

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 2, 2011 at 9:43 AM, Alan Cox <[email protected]> wrote:
>> The alternative is to have a process constantly polling and reading
>> the file, which is nothing we even want to think about in 2011.
>
> Or to manage it properly.

What if the user decides do invoke sethostname syscall "by hand"?
Hostname would change beneath any other process that is trying to
manage it properly. What this patch does is to notify that process
that something happened.


>> It's just another special case to bring us out of the UNIX stone age
>> of doing things. :)
>
> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
> short name' policy. Forget utsname node names, they are a historical
> quirk of UUCP and friends and on many OS platforms will be limited to 15
> chars !
>
> As to poll in general I can see some of the other proc files being
> more relevant, eg for process monitoring tools being able to poll
> in /proc/<pid> and some of the proc/sys and sysctl data that does change
> meaningfully. Utsname however is not one of those things.
>

With this patch in, if anyone wants to manage a file under /proc/sys
there's really a small amount of code to write. He only has to define
the new poll struct for that file.


Lucas De Marchi

2011-06-02 13:11:42

by Alan

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

> > Or to manage it properly.
>
> What if the user decides do invoke sethostname syscall "by hand"?
> Hostname would change beneath any other process that is trying to
> manage it properly. What this patch does is to notify that process
> that something happened.

That is a stupid argument. Shall we extend it to its logical idiotic end
and ask

"What if the user decides to recompile their kernel without sysfs poll
support ?"

You have to be root to run sethostname, at which point you are
realistically at the command line, a superuser and you know what you are
doing (eg using sethostname for non IP network naming, or cluster id, or
other stuff).

> With this patch in, if anyone wants to manage a file under /proc/sys
> there's really a small amount of code to write. He only has to define
> the new poll struct for that file.

Sure - and there is an 8 byte cost per sysctl node (of which we have
rather a lot), and we really need to tackle sysfs not sysctl anyway.

I'm not averse to pollable sysfs/sysctl nodes at all although the memory
hit on sysfs is going to be tricky to manage and need clever code.

I just think the utsname is a completely misguided example and whoever is
trying to do it doesn't actually understand the limits of utsname.

Alan

2011-06-02 13:24:28

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 2, 2011 at 15:12, Alan Cox <[email protected]> wrote:
>> > Or to manage it properly.
>>
>> What if the user decides do invoke sethostname syscall "by hand"?
>> Hostname would change beneath any other process that is trying to
>> manage it properly. What this patch does is to notify that process
>> that something happened.
>
> That is a stupid argument. Shall we extend it to its logical idiotic end
> and ask
>
> "What if the user decides to recompile their kernel without sysfs poll
> support ?"

Alan please! This is not something we haven't thought through.

> You have to be root to run sethostname, at which point you are
> realistically at the command line, a superuser and you know what you are
> doing (eg using sethostname for non IP network naming, or cluster id, or
> other stuff).

Please stay to the actual problem this patch tries to resolve.

>> With this patch in, if anyone wants to manage a file under /proc/sys
>> there's really a small amount of code to write. He only has to define
>> the new poll struct for that file.
>
> Sure - and there is an 8 byte cost per sysctl node (of which we have
> rather a lot), and we really need to tackle sysfs not sysctl anyway.

It is. And we will very likely need poll() for other things in
/proc/sys too. It's the cost of providing functionality we just need
today.

I could understand arguing about things like: void *extra1; void
*extra2; in that very same structure, but not about something that
can't really be solved otherwise.

> I'm not averse to pollable sysfs/sysctl nodes at all although the memory
> hit on sysfs is going to be tricky to manage and need clever code.

Yeah, but not related to the problem this patch tries to solve.

> I just think the utsname is a completely misguided example and whoever is
> trying to do it doesn't actually understand the limits of utsname.

We are not talking about limits of a certain infrastructure. It is
used, it will not go away, we need to support it.

This is about propagating in-kernel state changes to userspace. Please
open a different conversation for everything else.

Thanks,
Kay

2011-06-02 13:56:39

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

Lucas De Marchi <[email protected]> writes:

> On Thu, Jun 2, 2011 at 9:43 AM, Alan Cox <[email protected]> wrote:
>>> The alternative is to have a process constantly polling and reading
>>> the file, which is nothing we even want to think about in 2011.
>>
>> Or to manage it properly.
>
> What if the user decides do invoke sethostname syscall "by hand"?
> Hostname would change beneath any other process that is trying to
> manage it properly. What this patch does is to notify that process
> that something happened.
>
>
>>> It's just another special case to bring us out of the UNIX stone age
>>> of doing things. :)
>>
>> Unfortunately not. It's a misguided attempt to follow stone age Unix 'one
>> short name' policy. Forget utsname node names, they are a historical
>> quirk of UUCP and friends and on many OS platforms will be limited to 15
>> chars !
>>
>> As to poll in general I can see some of the other proc files being
>> more relevant, eg for process monitoring tools being able to poll
>> in /proc/<pid> and some of the proc/sys and sysctl data that does change
>> meaningfully. Utsname however is not one of those things.
>>
>
> With this patch in, if anyone wants to manage a file under /proc/sys
> there's really a small amount of code to write. He only has to define
> the new poll struct for that file.

The support currently appears cumbersome to add, and it adds what appear
to be unnecessary wake ups (say when the hostname in another uts
namespace changes).

There is no explanation at all of why you care about the nis domainname.

Since there does not appear to be a specific problem that this problem
is being aimed at, since the code just looks like extra maintenance and
since the code needed to support this appears to be unnecessarily
cumbersome I am going to nack the patch for now.

Nacked-by: "Eric W. Biederman" <[email protected]>

If the goal here is just to fix the general case then we probably
want to get inotify going on proc files instead of poll. Either
that or we want pollable files to appear as something besides files
so that it is clear to their users that poll will work.

Eric

2011-06-02 14:16:49

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

Alan Cox <[email protected]> writes:

>> Host names are dynamic, can change during system runtime by dhcp or
>> similar setups, or just get changed by the user.
>
> I don't actually see what this has to do with utsname. uname historically
> defined nodename as "name within an implementation-defined communications
> network" and actually tended to be the UUCP name. Modern SuS says "`the
> name of the node of the communications network to which this node is
> attached, if any"

>
> The latter unfortunately makes no sense anyway and is a fine example of
> standards body cluelessness as name mapping on IP networks is not one
> name per host, and also because the standard doesn't require the fields
> in the struct are long enough to hold a DNS name!
>
> (Indeed in its usual head up backside manner its technically valid to
> define
>
> char nodename[1];
>
> and have only \0 as a valid reply)

However we have conveniently defined sethostname and gethostname to use
the same state in the kernel, as uname. I believe at least one of these
interfaces that map to the same storage in linux has a usable size
guaranteed by all of the implementations.

Eric

2011-06-02 17:33:07

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <[email protected]> wrote:
> Lucas De Marchi <[email protected]> writes:

>> With this patch in, if anyone wants to manage a file under /proc/sys
>> there's really a small amount of code to write. He only has to define
>> the new poll struct for that file.
>
> The support currently appears cumbersome to add, and it adds what appear
> to be unnecessary wake ups (say when the hostname in another uts
> namespace changes).

Yeah, *possibly* waking up once a day compared to *unconditionally*
waking up every second in every namespace and check if something has
changed. If that *possibly* should be optimized, which I think isn't
necessary at all, I guess the logic could be moved down to the
namespaced data.

> There is no explanation at all of why you care about the nis domainname.

Just the same reason as the hostname, we need to know when the
kernel's internal state changes. regardless who did it and why, system
services need to follow it.

> Since there does not appear to be a specific problem that this problem
> is being aimed at, since the code just looks like extra maintenance and
> since the code needed to support this appears to be unnecessarily
> cumbersome I am going to nack the patch for now.
>
> Nacked-by: "Eric W. Biederman" <[email protected]>

Please provide solid technical reasons, why we can't to the same we do
everywhere else since years, or provide working alternatives. Until
then:

Acked-By: Kay Sievers <[email protected]>

> If the goal here is just to fix the general case then we probably
> want to get inotify going on proc files instead of poll.  Either
> that or we want pollable files to appear as something besides files
> so that it is clear to their users that poll will work.

I think poll() is the natural interface if you care about the data in
a file. It's the same an single fd you care, and not some iniotify
watch, fd, pathname to register, and filter for whatever comes out
there.

We already use exactly the same semantics for quite some years for
/proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
patch just provides the missing pieces that /proc provides, but
/proc/sys is missing.

I don't disagree, it might be nice to have generic inotify support on
/proc for this or other problems. But unless you want to work on it
now, and it would solve these problems, I don't see how we can get the
functionality we need, and that seems to solved with this patch.
Besides the fact that I think poll() is the simple and better
solution.

Thanks,
Kay

2011-06-08 22:27:02

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, 2 Jun 2011 19:32:49 +0200
Kay Sievers <[email protected]> wrote:

> > Nacked-by: "Eric W. Biederman" <[email protected]>
>
> Acked-By: Kay Sievers <[email protected]>

The patch itself doesn't look too bad to me...

We already have several pollable procfs files, such as
fs/proc/base.c:mounts_poll() and I think drivers/md has one. I do
think that any work in this area should end up with those custom
make-procfs-pollable hacks being identified and removed.

2011-06-09 13:16:32

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, Jun 9, 2011 at 00:17, Andrew Morton <[email protected]> wrote:
> On Thu, 2 Jun 2011 19:32:49 +0200
> Kay Sievers <[email protected]> wrote:
>
>> > Nacked-by: "Eric W. Biederman" <[email protected]>
>>
>> Acked-By: Kay Sievers <[email protected]>
>
> The patch itself doesn't look too bad to me...
>
> We already have several pollable procfs files, such as
> fs/proc/base.c:mounts_poll() and I think drivers/md has one.  I do
> think that any work in this area should end up with those custom
> make-procfs-pollable hacks being identified and removed.

For these files we can probably move the event counter into the
seq_file structure, and get rid of the dance to kmalloc it and assign
it to seq_file->private. That might simplify the logic a bit.

[Adding Neil, to get his opinion of moving 'event' so seq_file and get
rid of the malloc dance]

The wait_queue and the change counter needs to be in the same context
as the watched data, so we can probably not really help with generic
proc infrastructure here.

This patch is sysctl infrastructure, which is kind of a subclass of
proc like seq_file is. I have no good idea how to share anything
between them. Unlike the three current users of seq_file, the sysctl
stuff already moved the needed things to the common sysctl code.

If moving the individual event counter to seq_file makes sense, we can
give it a shot, but it don't think it really affects the sysctl case.
Maybe someone has good idea how to unify them, I currently don't have.

Kay

2011-06-12 15:34:59

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

Kay Sievers <[email protected]> writes:

> On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <[email protected]> wrote:
>> Lucas De Marchi <[email protected]> writes:
>
>>> With this patch in, if anyone wants to manage a file under /proc/sys
>>> there's really a small amount of code to write. He only has to define
>>> the new poll struct for that file.
>>
>> The support currently appears cumbersome to add, and it adds what appear
>> to be unnecessary wake ups (say when the hostname in another uts
>> namespace changes).
>
> Yeah, *possibly* waking up once a day compared to *unconditionally*
> waking up every second in every namespace and check if something has
> changed. If that *possibly* should be optimized, which I think isn't
> necessary at all, I guess the logic could be moved down to the
> namespaced data.

Unless you happen to be on a system, where someone has decided that it
has a daemon that likes creating a new set of namespaces per connection
to reduce the effect of code bugs. At which point you could be talking
much more than a wake up per second.

>> There is no explanation at all of why you care about the nis domainname.
>
> Just the same reason as the hostname, we need to know when the
> kernel's internal state changes. regardless who did it and why, system
> services need to follow it.

So the problem is that you have a system where userspace can't get it's
act together?

>> Since there does not appear to be a specific problem that this problem
>> is being aimed at, since the code just looks like extra maintenance and
>> since the code needed to support this appears to be unnecessarily
>> cumbersome I am going to nack the patch for now.
>>
>> Nacked-by: "Eric W. Biederman" <[email protected]>
>
> Please provide solid technical reasons, why we can't to the same we do
> everywhere else since years, or provide working alternatives. Until
> then:
>
> Acked-By: Kay Sievers <[email protected]>

Looking a little closer the patch is broken.

It changes the return of poll for every file under /proc/sys reporting
no file under /proc/sys is writable unless it implements the new poll
method.

Also with having the wait_queue owned by the calling code either I am
mistake or this breaks hotplug type situations. How do you get things
off of your wait queue when you remove a file from /proc/sys. This
infrastructure as written looks like a setup for use after free
problems.

> I think poll() is the natural interface if you care about the data in
> a file. It's the same an single fd you care, and not some iniotify
> watch, fd, pathname to register, and filter for whatever comes out
> there.

Sort of. poll is really designed for socket and pipe data.

The problem with poll is there is no POLLUPDATED.

> We already use exactly the same semantics for quite some years for
> /proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
> patch just provides the missing pieces that /proc provides, but
> /proc/sys is missing.

Good point.

There is still the problem that the infrastructure code for the proc
sysctl files are in much worse shape than the sysfs files.

> I don't disagree, it might be nice to have generic inotify support on
> /proc for this or other problems. But unless you want to work on it
> now, and it would solve these problems, I don't see how we can get the
> functionality we need, and that seems to solved with this patch.
> Besides the fact that I think poll() is the simple and better
> solution.

Which is a question that has never been answered. What functionality
do you need? To me this all looks like a bad science experiment.

Eric

2011-06-13 14:28:59

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Sun, Jun 12, 2011 at 17:34, Eric W. Biederman <[email protected]> wrote:
> Kay Sievers <[email protected]> writes:
>> On Thu, Jun 2, 2011 at 15:56, Eric W. Biederman <[email protected]> wrote:
>>> Lucas De Marchi <[email protected]> writes:
>>
>>>> With this patch in, if anyone wants to manage a file under /proc/sys
>>>> there's really a small amount of code to write. He only has to define
>>>> the new poll struct for that file.
>>>
>>> The support currently appears cumbersome to add, and it adds what appear
>>> to be unnecessary wake ups (say when the hostname in another uts
>>> namespace changes).
>>
>> Yeah, *possibly* waking up once a day compared to *unconditionally*
>> waking up every second in every namespace and check if something has
>> changed. If that *possibly* should be optimized, which I think isn't
>> necessary at all, I guess the logic could be moved down to the
>> namespaced data.
>
> Unless you happen to be on a system, where someone has decided that it
> has a daemon that likes creating a new set of namespaces per connection
> to reduce the effect of code bugs.  At which point you could be talking
> much more than a wake up per second.

What do you mean? We talk about changing the hostname, right? How
often do we expect that to happen?

You mean every network connection mounts a proc filesystem and runs a
service that polls hostname files there?

>>> There is no explanation at all of why you care about the nis domainname.
>>
>> Just the same reason as the hostname, we need to know when the
>> kernel's internal state changes. regardless who did it and why, system
>> services need to follow it.
>
> So the problem is that you have a system where userspace can't get it's
> act together?

There will never be a layer between the kernel and userspace that is
centrally managed. The kernel keeps the authoritative data, so we need
to know when _this_ data changes, not when some imaginary middle man
might manage it.

It will just never happen in reality, we stopped long ago being so
naive. And 'getting the act together' today is 'watching what people
did' to the kernel data, not tell them how to do it, or not to do it.

We can change some parts sometimes, and recommend some sanity, but
surely nobody will patch all the 15 dhcp clients to work with some
strange higher-level hostname interface here. People use the syscall
to set it, and there is nothing wrong with it, we just need to know
it.

>>> Since there does not appear to be a specific problem that this problem
>>> is being aimed at, since the code just looks like extra maintenance and
>>> since the code needed to support this appears to be unnecessarily
>>> cumbersome I am going to nack the patch for now.
>>>
>>> Nacked-by: "Eric W. Biederman" <[email protected]>
>>
>> Please provide solid technical reasons, why we can't to the same we do
>> everywhere else since years, or provide working alternatives. Until
>> then:
>>
>> Acked-By: Kay Sievers <[email protected]>
>
> Looking a little closer the patch is broken.
>
> It changes the return of poll for every file under /proc/sys reporting
> no file under /proc/sys is writable unless it implements the new poll
> method.

Poll is undefined for regular files, or proc/sys files which don't
implement it. What do you mean with writable? How does not providing
poll() affect write()?

> Also with having the wait_queue owned by the calling code either I am
> mistake or this breaks hotplug type situations.  How do you get things
> off of your wait queue when you remove a file from /proc/sys.
>
> This
> infrastructure as written looks like a setup for use after free
> problems.

The wait queue is in static non-allocated code. What do you mean?

>> I think poll() is the natural interface if you care about the data in
>> a file. It's the same an single fd you care, and not some iniotify
>> watch, fd, pathname to register, and filter for whatever comes out
>> there.
>
> Sort of.  poll is really designed for socket and pipe data.
>
> The problem with poll is there is no POLLUPDATED.

Right, hence we use POLLERR|POLLPRI, to indicate something has
changed. The case is kind of special because poll does not tell 'there
is more to read' but 'something happend, re-read it again'.

>> We already use exactly the same semantics for quite some years for
>> /proc/mounts, /proc/swaps, /proc/mdstat, ... and all over /sys. The
>> patch just provides the missing pieces that /proc provides, but
>> /proc/sys is missing.
>
> Good point.
>
> There is still the problem that the infrastructure code for the proc
> sysctl files are in much worse shape than the sysfs files.
>
>> I don't disagree, it might be nice to have generic inotify support on
>> /proc for this or other problems. But unless you want to work on it
>> now, and it would solve these problems, I don't see how we can get the
>> functionality we need, and that seems to solved with this patch.
>> Besides the fact that I think poll() is the simple and better
>> solution.
>
> Which is a question that has never been answered.

Sure, any other easy-to-use and easy-to-get-working interface you have
in mind? We can certainly use something else if it can provide us with
the updates we are looking for.

> What functionality
> do you need?  To me this all looks like a bad science experiment.

We need a simple interface to detect changes to the hostname data in
the kernel. It does not matter if the proc file is written or
sethostname() or hostname(1) is used. Any source of such change needs
to be propagated to running system services, to adapt to the new
hostname.

Kay

2011-06-13 16:05:59

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> On Thu, Jun 9, 2011 at 00:17, Andrew Morton <[email protected]> wrote:

> > We already have several pollable procfs files, such as
> > fs/proc/base.c:mounts_poll() and I think drivers/md has one. I do
> > think that any work in this area should end up with those custom
> > make-procfs-pollable hacks being identified and removed.
>
> For these files we can probably move the event counter into the
> seq_file structure, and get rid of the dance to kmalloc it and assign
> it to seq_file->private. That might simplify the logic a bit.
>
> [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> rid of the malloc dance]

I guess, we could do something like this, which looks quite a bit
simpler by moving the poll event counter into the dynamically allocated
seq_file structure itself, instead of having private structures
allocated on top to just carry the counter (patch is just
compile-tested).

Thanks,
Kay

---
drivers/md/md.c | 26 ++++++++------------------
fs/namespace.c | 4 ++--
fs/proc/base.c | 2 +-
include/linux/mnt_namespace.h | 1 -
include/linux/seq_file.h | 1 +
mm/swapfile.c | 29 ++++++++---------------------
6 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa640a8..b073d36 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6377,16 +6377,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
mddev_put(mddev);
}

-struct mdstat_info {
- int event;
-};
-
static int md_seq_show(struct seq_file *seq, void *v)
{
mddev_t *mddev = v;
sector_t sectors;
mdk_rdev_t *rdev;
- struct mdstat_info *mi = seq->private;
struct bitmap *bitmap;

if (v == (void*)1) {
@@ -6398,7 +6393,7 @@ static int md_seq_show(struct seq_file *seq, void *v)

spin_unlock(&pers_lock);
seq_printf(seq, "\n");
- mi->event = atomic_read(&md_event_count);
+ seq->poll_event = atomic_read(&md_event_count);
return 0;
}
if (v == (void*)2) {
@@ -6510,26 +6505,21 @@ static const struct seq_operations md_seq_ops = {

static int md_seq_open(struct inode *inode, struct file *file)
{
+ struct seq_file *seq;
int error;
- struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
- if (mi == NULL)
- return -ENOMEM;

error = seq_open(file, &md_seq_ops);
if (error)
- kfree(mi);
- else {
- struct seq_file *p = file->private_data;
- p->private = mi;
- mi->event = atomic_read(&md_event_count);
- }
+ return error;
+
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&md_event_count);
return error;
}

static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
- struct seq_file *m = filp->private_data;
- struct mdstat_info *mi = m->private;
+ struct seq_file *seq = filp->private_data;
int mask;

poll_wait(filp, &md_event_waiters, wait);
@@ -6537,7 +6527,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
/* always allow read */
mask = POLLIN | POLLRDNORM;

- if (mi->event != atomic_read(&md_event_count))
+ if (seq->poll_event != atomic_read(&md_event_count))
mask |= POLLERR | POLLPRI;
return mask;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index fe59bd1..cda50fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p)
int res = 0;

br_read_lock(vfsmount_lock);
- if (p->event != ns->event) {
- p->event = ns->event;
+ if (p->m.poll_event != ns->event) {
+ p->m.poll_event = ns->event;
res = 1;
}
br_read_unlock(vfsmount_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 14def99..16d33a3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->m.private = p;
p->ns = ns;
p->root = root;
- p->event = ns->event;
+ p->m.poll_event = ns->event;

return 0;

diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 0b89efc..2930485 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -18,7 +18,6 @@ struct proc_mounts {
struct seq_file m; /* must be the first element */
struct mnt_namespace *ns;
struct path root;
- int event;
};

struct fs_struct;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 03c0232..be720cd 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -23,6 +23,7 @@ struct seq_file {
u64 version;
struct mutex lock;
const struct seq_operations *op;
+ int poll_event;
void *private;
};

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29..5161d7d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
}

#ifdef CONFIG_PROC_FS
-struct proc_swaps {
- struct seq_file seq;
- int event;
-};
-
static unsigned swaps_poll(struct file *file, poll_table *wait)
{
- struct proc_swaps *s = file->private_data;
+ struct seq_file *seq = file->private_data;

poll_wait(file, &proc_poll_wait, wait);

- if (s->event != atomic_read(&proc_poll_event)) {
- s->event = atomic_read(&proc_poll_event);
+ if (seq->poll_event != atomic_read(&proc_poll_event)) {
+ seq->poll_event = atomic_read(&proc_poll_event);
return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
}

@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {

static int swaps_open(struct inode *inode, struct file *file)
{
- struct proc_swaps *s;
+ struct seq_file *seq;
int ret;

- s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- file->private_data = s;
-
ret = seq_open(file, &swaps_op);
- if (ret) {
- kfree(s);
+ if (ret)
return ret;
- }

- s->seq.private = s;
- s->event = atomic_read(&proc_poll_event);
- return ret;
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return 0;
}

static const struct file_operations proc_swaps_operations = {


2011-06-14 03:52:43

by Lucas De Marchi

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

* Kay Sievers <[email protected]> [2011-06-13 18:05:51 +0200]:

> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> > On Thu, Jun 9, 2011 at 00:17, Andrew Morton <[email protected]> wrote:
>
> > > We already have several pollable procfs files, such as
> > > fs/proc/base.c:mounts_poll() and I think drivers/md has one. I do
> > > think that any work in this area should end up with those custom
> > > make-procfs-pollable hacks being identified and removed.
> >
> > For these files we can probably move the event counter into the
> > seq_file structure, and get rid of the dance to kmalloc it and assign
> > it to seq_file->private. That might simplify the logic a bit.
> >
> > [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> > rid of the malloc dance]
>
> I guess, we could do something like this, which looks quite a bit
> simpler by moving the poll event counter into the dynamically allocated
> seq_file structure itself, instead of having private structures
> allocated on top to just carry the counter (patch is just
> compile-tested).
>
> Thanks,
> Kay
>
> ---
> drivers/md/md.c | 26 ++++++++------------------
> fs/namespace.c | 4 ++--
> fs/proc/base.c | 2 +-
> include/linux/mnt_namespace.h | 1 -
> include/linux/seq_file.h | 1 +
> mm/swapfile.c | 29 ++++++++---------------------
> 6 files changed, 20 insertions(+), 43 deletions(-)

I've successfully tested this on top of 3.0-rc3 for /proc/mounts and
/proc/swaps. I've booted with systemd (that depends on mounts and swaps
being pollable) and created a small test using epoll.


Lucas De Marchi

2011-06-14 04:18:09

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] sysctl: add support for poll()

On Mon, 13 Jun 2011 18:05:51 +0200 Kay Sievers <[email protected]> wrote:

> On Thu, 2011-06-09 at 15:16 +0200, Kay Sievers wrote:
> > On Thu, Jun 9, 2011 at 00:17, Andrew Morton <[email protected]> wrote:
>
> > > We already have several pollable procfs files, such as
> > > fs/proc/base.c:mounts_poll() and I think drivers/md has one. I do
> > > think that any work in this area should end up with those custom
> > > make-procfs-pollable hacks being identified and removed.
> >
> > For these files we can probably move the event counter into the
> > seq_file structure, and get rid of the dance to kmalloc it and assign
> > it to seq_file->private. That might simplify the logic a bit.
> >
> > [Adding Neil, to get his opinion of moving 'event' so seq_file and get
> > rid of the malloc dance]
>
> I guess, we could do something like this, which looks quite a bit
> simpler by moving the poll event counter into the dynamically allocated
> seq_file structure itself, instead of having private structures
> allocated on top to just carry the counter (patch is just
> compile-tested).
>
> Thanks,
> Kay

Acked-by: NeilBrown <[email protected]>

Looks like a nice improvement - thanks.

It is a pity that files don't all respond the same way to select/poll though.

Some set POLLIN|POLLRDNORM always, some set it only when there is a new event.

I have found that the former is safer. There are some frameworks that always
use select/poll before reading, and get confused when they cannot read from
some /proc file.

And uniformity is good anyway.

NeilBrown


>
> ---
> drivers/md/md.c | 26 ++++++++------------------
> fs/namespace.c | 4 ++--
> fs/proc/base.c | 2 +-
> include/linux/mnt_namespace.h | 1 -
> include/linux/seq_file.h | 1 +
> mm/swapfile.c | 29 ++++++++---------------------
> 6 files changed, 20 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index aa640a8..b073d36 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6377,16 +6377,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
> mddev_put(mddev);
> }
>
> -struct mdstat_info {
> - int event;
> -};
> -
> static int md_seq_show(struct seq_file *seq, void *v)
> {
> mddev_t *mddev = v;
> sector_t sectors;
> mdk_rdev_t *rdev;
> - struct mdstat_info *mi = seq->private;
> struct bitmap *bitmap;
>
> if (v == (void*)1) {
> @@ -6398,7 +6393,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
>
> spin_unlock(&pers_lock);
> seq_printf(seq, "\n");
> - mi->event = atomic_read(&md_event_count);
> + seq->poll_event = atomic_read(&md_event_count);
> return 0;
> }
> if (v == (void*)2) {
> @@ -6510,26 +6505,21 @@ static const struct seq_operations md_seq_ops = {
>
> static int md_seq_open(struct inode *inode, struct file *file)
> {
> + struct seq_file *seq;
> int error;
> - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
> - if (mi == NULL)
> - return -ENOMEM;
>
> error = seq_open(file, &md_seq_ops);
> if (error)
> - kfree(mi);
> - else {
> - struct seq_file *p = file->private_data;
> - p->private = mi;
> - mi->event = atomic_read(&md_event_count);
> - }
> + return error;
> +
> + seq = file->private_data;
> + seq->poll_event = atomic_read(&md_event_count);
> return error;
> }
>
> static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
> {
> - struct seq_file *m = filp->private_data;
> - struct mdstat_info *mi = m->private;
> + struct seq_file *seq = filp->private_data;
> int mask;
>
> poll_wait(filp, &md_event_waiters, wait);
> @@ -6537,7 +6527,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
> /* always allow read */
> mask = POLLIN | POLLRDNORM;
>
> - if (mi->event != atomic_read(&md_event_count))
> + if (seq->poll_event != atomic_read(&md_event_count))
> mask |= POLLERR | POLLPRI;
> return mask;
> }
> diff --git a/fs/namespace.c b/fs/namespace.c
> index fe59bd1..cda50fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p)
> int res = 0;
>
> br_read_lock(vfsmount_lock);
> - if (p->event != ns->event) {
> - p->event = ns->event;
> + if (p->m.poll_event != ns->event) {
> + p->m.poll_event = ns->event;
> res = 1;
> }
> br_read_unlock(vfsmount_lock);
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 14def99..16d33a3 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
> p->m.private = p;
> p->ns = ns;
> p->root = root;
> - p->event = ns->event;
> + p->m.poll_event = ns->event;
>
> return 0;
>
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 0b89efc..2930485 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -18,7 +18,6 @@ struct proc_mounts {
> struct seq_file m; /* must be the first element */
> struct mnt_namespace *ns;
> struct path root;
> - int event;
> };
>
> struct fs_struct;
> diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
> index 03c0232..be720cd 100644
> --- a/include/linux/seq_file.h
> +++ b/include/linux/seq_file.h
> @@ -23,6 +23,7 @@ struct seq_file {
> u64 version;
> struct mutex lock;
> const struct seq_operations *op;
> + int poll_event;
> void *private;
> };
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index d537d29..5161d7d 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1681,19 +1681,14 @@ out:
> }
>
> #ifdef CONFIG_PROC_FS
> -struct proc_swaps {
> - struct seq_file seq;
> - int event;
> -};
> -
> static unsigned swaps_poll(struct file *file, poll_table *wait)
> {
> - struct proc_swaps *s = file->private_data;
> + struct seq_file *seq = file->private_data;
>
> poll_wait(file, &proc_poll_wait, wait);
>
> - if (s->event != atomic_read(&proc_poll_event)) {
> - s->event = atomic_read(&proc_poll_event);
> + if (seq->poll_event != atomic_read(&proc_poll_event)) {
> + seq->poll_event = atomic_read(&proc_poll_event);
> return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
> }
>
> @@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
>
> static int swaps_open(struct inode *inode, struct file *file)
> {
> - struct proc_swaps *s;
> + struct seq_file *seq;
> int ret;
>
> - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
> - if (!s)
> - return -ENOMEM;
> -
> - file->private_data = s;
> -
> ret = seq_open(file, &swaps_op);
> - if (ret) {
> - kfree(s);
> + if (ret)
> return ret;
> - }
>
> - s->seq.private = s;
> - s->event = atomic_read(&proc_poll_event);
> - return ret;
> + seq = file->private_data;
> + seq->poll_event = atomic_read(&proc_poll_event);
> + return 0;
> }
>
> static const struct file_operations proc_swaps_operations = {
>
>