2006-03-22 14:58:06

by Yi Yang

[permalink] [raw]
Subject: [2.6.16 PATCH] Connector: Filesystem Events Connector

This patch implements a new connector, Filesystem Event Connector,
the user can monitor filesystem activities via it, currently, it
can monitor access, attribute change, open, create, modify, delete,
move and close of any file or directory.

Every filesystem event will include tgid, uid and gid of the process
which triggered this event, process name, file or directory name
operated by it.

Filesystem events connector is never a duplicate of inotify, inotify
just concerns change on file or directory, Beagle uses it to watch
file changes in order to regenerate index for it, inotify can't tell
us who did that change and what is its process name, but filesystem
events connector can do these, moreover inotify's overhead is greater
than filesystem events connector, inotify needs compare inode with
watched file or directories list to decide whether it should generate an
inotify_event, some locks also increase overhead, filesystem event
connector hasn't these overhead, it just generates a fsevent and send.

To be important, filesystem event connector doesn't add any new system
call, the user space application can make use of it by netlink socket,
but inotify added several system calls, many events mechanism in kernel
have used netlink as communication way with user space, for example,
KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
to unify events interface to netlink, the user space application can use
it very easy.

Signed-off-by: Yi Yang <[email protected]>

--- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
+++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
@@ -34,6 +34,8 @@
#define CN_VAL_PROC 0x1
#define CN_IDX_CIFS 0x2
#define CN_VAL_CIFS 0x1
+#define CN_IDX_FS 0x3
+#define CN_VAL_FS 0x1

#define CN_NETLINK_USERS 1

--- a/include/linux/fsnotify.h.orig 2006-01-03 11:21:10.000000000 +0800
+++ b/include/linux/fsnotify.h 2006-03-22 21:48:24.000000000 +0800
@@ -15,6 +15,7 @@

#include <linux/dnotify.h>
#include <linux/inotify.h>
+#include <linux/fsevent.h>

/*
* fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -45,6 +46,8 @@ static inline void fsnotify_move(struct
if (source) {
inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
}
+ raise_fsevent_move(old_dir, old_name, new_dir, new_name,
+ FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
}

/*
@@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
isdir = IN_ISDIR;
dnotify_parent(dentry, DN_DELETE);
inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+ raise_fsevent(dentry,
+ FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
}

/*
@@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
{
inode_dir_notify(inode, DN_CREATE);
inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+ raise_fsevent_create(inode, name, FSEVENT_CREATE);
}

/*
@@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
{
inode_dir_notify(inode, DN_CREATE);
inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+ raise_fsevent_create(inode, name,
+ FSEVENT_CREATE | FSEVENT_ISDIR);
}

/*
@@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
dnotify_parent(dentry, DN_ACCESS);
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL);
+ raise_fsevent(dentry, FSEVENT_ACCESS |
+ ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
}

/*
@@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
dnotify_parent(dentry, DN_MODIFY);
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL);
+ raise_fsevent(dentry, FSEVENT_MODIFY |
+ ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
}

/*
@@ -130,6 +142,9 @@ static inline void fsnotify_open(struct

inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL);
+ raise_fsevent(dentry, FSEVENT_OPEN |
+ ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
+
}

/*
@@ -148,6 +163,8 @@ static inline void fsnotify_close(struct

inotify_dentry_parent_queue_event(dentry, mask, 0, name);
inotify_inode_queue_event(inode, mask, 0, NULL);
+ raise_fsevent(dentry, FSEVENT_CLOSE |
+ ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
}

/*
@@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct

inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL);
+ raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
+ ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
}

/*
@@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
inotify_dentry_parent_queue_event(dentry, in_mask, 0,
dentry->d_name.name);
}
+
+#ifdef CONFIG_FS_EVENTS
+ {
+ u32 fsevent_mask = 0;
+ if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
+ fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+ if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
+ fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
+ else if (ia_valid & ATTR_ATIME)
+ fsevent_mask |= FSEVENT_ACCESS;
+ else if (ia_valid & ATTR_MTIME)
+ fsevent_mask |= FSEVENT_MODIFY;
+ if (ia_valid & ATTR_SIZE)
+ fsevent_mask |= FSEVENT_MODIFY;
+ if (fsevent_mask)
+ raise_fsevent(dentry, fsevent_mask);
+ }
+#endif /* CONFIG_FS_EVENTS */
}

#ifdef CONFIG_INOTIFY /* inotify helpers */
--- a/drivers/connector/Makefile.orig 2006-03-12 21:23:18.000000000 +0800
+++ b/drivers/connector/Makefile 2006-03-12 21:24:25.000000000 +0800
@@ -1,4 +1,5 @@
obj-$(CONFIG_CONNECTOR) += cn.o
obj-$(CONFIG_PROC_EVENTS) += cn_proc.o
+obj-$(CONFIG_FS_EVENTS) += cn_fs.o

cn-y += cn_queue.o connector.o
--- a/drivers/connector/Kconfig.orig 2006-03-12 21:23:53.000000000 +0800
+++ b/drivers/connector/Kconfig 2006-03-12 21:21:44.000000000 +0800
@@ -18,4 +18,12 @@ config PROC_EVENTS
Provide a connector that reports process events to userspace. Send
events such as fork, exec, id change (uid, gid, suid, etc), and exit.

+config FS_EVENTS
+ boolean "Report filesystem events to userspace"
+ depends on CONNECTOR=y
+ default y
+ ---help---
+ Provide a connector that reports filesystem events to userspace. The
+ reported event include open, read, write.
+
endmenu
--- /dev/null 2003-01-30 18:24:37.000000000 +0800
+++ b/include/linux/fsevent.h 2006-03-16 22:52:06.000000000 +0800
@@ -0,0 +1,86 @@
+/*
+ * fsevent.h - filesystem events connector
+ *
+ * Copyright (C) 2006 Yi Yang <[email protected]>
+ * Based on cn_proc.h by Matt Helsley, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef LINUX_FSEVENT_H
+#define LINUX_FSEVENT_H
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/connector.h>
+
+enum fsevent_type {
+ FSEVENT_ACCESS = 0x00000001, /* File was accessed */
+ FSEVENT_MODIFY = 0x00000002, /* File was modified */
+ FSEVENT_MODIFY_ATTRIB = 0x00000004, /* Metadata changed */
+ FSEVENT_CLOSE = 0x00000008, /* File was closed */
+ FSEVENT_OPEN = 0x00000010, /* File was opened */
+ FSEVENT_MOVE = 0x00000020, /* File was moved */
+ FSEVENT_CREATE = 0x00000040, /* File was created */
+ FSEVENT_DELETE = 0x00000080, /* File was deleted */
+ FSEVENT_CMDACK = 0x40000000, /* For used internally */
+ FSEVENT_ISDIR = 0x80000000 /* It is set for a dir */
+};
+
+/*
+ * Userspace sends this enum to register with the kernel that it is listening
+ * for events on the connector.
+ */
+enum fsevent_mode {
+ FSEVENT_LISTEN = 1,
+ FSEVENT_IGNORE = 2
+};
+
+struct fsevent {
+ __u32 type;
+ __u32 cpu;
+ struct timespec timestamp;
+ __u32 tgid;
+ __u32 uid;
+ __u32 gid;
+ __u32 err;
+ __u32 len;
+ __u32 pname_len;
+ __u32 fname_len;
+ __u32 new_fname_len;
+ char name[0];
+};
+
+#ifdef __KERNEL__
+#ifdef CONFIG_FS_EVENTS
+extern void raise_fsevent(struct dentry * dentryp, u32 mask);
+extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
+ struct inode * newdir, const char * newname, u32 mask);
+extern void raise_fsevent_create(struct inode * inode,
+ const char * name, u32 mask);
+#else
+static void raise_fsevent(struct dentry * dentryp, u32 mask);
+{}
+
+static void raise_fsevent_move(struct inode * olddir, const char * oldname,
+ struct inode * newdir, const char * newname, u32 mask)
+{}
+
+static void raise_fsevent_create(struct inode * inode,
+ const char * name, u32 mask)
+{}
+#endif /* CONFIG_FS_EVENTS */
+#endif /* __KERNEL__ */
+#endif /* LINUX_FSEVENT_H */
--- /dev/null 2003-01-30 18:24:37.000000000 +0800
+++ b/drivers/connector/cn_fs.c 2006-03-16 22:54:28.000000000 +0800
@@ -0,0 +1,200 @@
+/*
+ * cn_fs.c - filesystem events connector
+ *
+ * Copyright (C) 2006 Yi Yang <[email protected]>
+ * Based on cn_proc.c by Matt Helsley, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/fsevent.h>
+#include <asm/atomic.h>
+
+#define CN_FS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct fsevent))
+
+static atomic_t cn_fs_event_listeners = ATOMIC_INIT(0);
+static struct cb_id cn_fs_event_id = { CN_IDX_FS, CN_VAL_FS };
+
+static DEFINE_PER_CPU(__u32, cn_fs_event_counts) = { 0 };
+static int fsevent_burst_limit = 100;
+static int fsevent_ratelimit = 5 * HZ;
+
+static inline void get_seq(__u32 *ts, int *cpu)
+{
+ *ts = get_cpu_var(cn_fs_event_counts)++;
+ *cpu = smp_processor_id();
+ put_cpu_var(cn_fs_event_counts);
+}
+
+int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
+{
+ struct cn_msg *msg;
+ struct fsevent *event;
+ __u8 * buffer;
+ int namelen = 0;
+ static unsigned long last = 0;
+ static int fsevent_sum = 0;
+
+ if (atomic_read(&cn_fs_event_listeners) < 1)
+ return 0;
+
+ if (jiffies - last <= fsevent_ratelimit) {
+ if (fsevent_sum > fsevent_burst_limit) {
+ return -1;
+ }
+ fsevent_sum++;
+ }
+ else {
+ last = jiffies;
+ fsevent_sum = 0;
+ }
+
+ namelen = strlen(current->comm) + strlen(oldname) + 2;
+ if (newname) {
+ namelen += strlen(newname) + 1;
+ }
+
+ if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE + namelen, GFP_KERNEL))
+ == NULL) {
+ printk("cn_fs: out of memory\n");
+ return -1;
+ }
+
+ msg = (struct cn_msg*)buffer;
+ event = (struct fsevent*)msg->data;
+ get_seq(&msg->seq, &event->cpu);
+ ktime_get_ts(&event->timestamp);
+ event->type = mask;
+ event->tgid = current->tgid;
+ event->uid = current->uid;
+ event->gid = current->gid;
+ event->pname_len = strlen(current->comm);
+ memcpy(event->name, current->comm, event->pname_len);
+ event->name[event->pname_len] = '\0';
+ event->fname_len = strlen(oldname);
+ memcpy(event->name + event->pname_len + 1, oldname, strlen(oldname));
+ event->len = event->pname_len + event->fname_len + 2;
+ event->name[event->len-1] = '\0';
+ event->new_fname_len = 0;
+ if (newname) {
+ event->new_fname_len = strlen(newname);
+ memcpy(event->name + event->len, newname, strlen(newname));
+ event->len += event->new_fname_len + 1;
+ event->name[event->len-1] = '\0';
+ }
+
+ memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
+ msg->ack = 0;
+ msg->len = sizeof(struct fsevent) + event->len;
+ cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
+ kfree(buffer);
+ return 0;
+}
+
+void raise_fsevent(struct dentry * dentryp, u32 mask)
+{
+ __raise_fsevent(dentryp->d_name.name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent);
+
+void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
+{
+ __raise_fsevent(name, NULL, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_create);
+
+void raise_fsevent_move(struct inode * olddir, const char * oldname,
+ struct inode * newdir, const char * newname, u32 mask)
+{
+ __raise_fsevent(oldname, newname, mask);
+}
+EXPORT_SYMBOL_GPL(raise_fsevent_move);
+
+static void cn_fs_event_ack(int err, int rcvd_seq, int rcvd_ack)
+{
+ struct cn_msg *msg;
+ struct fsevent *event;
+ __u8 * buffer = NULL;
+
+ if (atomic_read(&cn_fs_event_listeners) < 1)
+ return;
+
+ if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE, GFP_KERNEL)) == NULL) {
+ printk("cn_fs: out of memory\n");
+ return;
+ }
+
+ msg = (struct cn_msg*)buffer;
+ event = (struct fsevent*)msg->data;
+ msg->seq = rcvd_seq;
+ ktime_get_ts(&event->timestamp);
+ event->cpu = -1;
+ event->type = FSEVENT_CMDACK;
+ event->tgid = 0;
+ event->uid = 0;
+ event->gid = 0;
+ event->len = 0;
+ event->pname_len = 0;
+ event->fname_len = 0;
+ event->new_fname_len = 0;
+ event->err = err;
+ memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
+ msg->ack = rcvd_ack + 1;
+ msg->len = sizeof(struct fsevent);
+ cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
+}
+
+static void cn_fsevent_mode_ctl(void *data)
+{
+ struct cn_msg *msg = data;
+ enum fsevent_mode *mode = NULL;
+ int err = 0;
+
+ if (msg->len != sizeof(*mode))
+ return;
+
+ mode = (enum fsevent_mode*)msg->data;
+ switch (*mode) {
+ case FSEVENT_LISTEN:
+ atomic_inc(&cn_fs_event_listeners);
+ break;
+ case FSEVENT_IGNORE:
+ atomic_dec(&cn_fs_event_listeners);
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+ cn_fs_event_ack(err, msg->seq, msg->ack);
+}
+
+static int __init cn_fs_init(void)
+{
+ int err;
+
+ if ((err = cn_add_callback(&cn_fs_event_id, "cn_fs",
+ &cn_fsevent_mode_ctl))) {
+ printk(KERN_WARNING "cn_fs failed to register\n");
+ return err;
+ }
+ return 0;
+}
+
+module_init(cn_fs_init);



2006-03-22 20:15:01

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Quoting Yi Yang ([email protected]):
> +#ifdef __KERNEL__
> +#ifdef CONFIG_FS_EVENTS
> +extern void raise_fsevent(struct dentry * dentryp, u32 mask);
> +extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
> + struct inode * newdir, const char * newname, u32 mask);
> +extern void raise_fsevent_create(struct inode * inode,
> + const char * name, u32 mask);
> +#else
> +static void raise_fsevent(struct dentry * dentryp, u32 mask);
> +{}

Hmm, this compiles, if !CONFIG_FS_EVENTS?


-serge

2006-03-23 07:55:21

by Matt Helsley

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
> This patch implements a new connector, Filesystem Event Connector,
> the user can monitor filesystem activities via it, currently, it
> can monitor access, attribute change, open, create, modify, delete,
> move and close of any file or directory.
>
> Every filesystem event will include tgid, uid and gid of the process
> which triggered this event, process name, file or directory name
> operated by it.
>
> Filesystem events connector is never a duplicate of inotify, inotify
> just concerns change on file or directory, Beagle uses it to watch
> file changes in order to regenerate index for it, inotify can't tell
> us who did that change and what is its process name, but filesystem
> events connector can do these, moreover inotify's overhead is greater
> than filesystem events connector, inotify needs compare inode with
> watched file or directories list to decide whether it should generate an
> inotify_event, some locks also increase overhead, filesystem event
> connector hasn't these overhead, it just generates a fsevent and send.
>
> To be important, filesystem event connector doesn't add any new system
> call, the user space application can make use of it by netlink socket,
> but inotify added several system calls, many events mechanism in kernel
> have used netlink as communication way with user space, for example,
> KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
> to unify events interface to netlink, the user space application can use
> it very easy.
>
> Signed-off-by: Yi Yang <[email protected]>
>
> --- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
> +++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
> @@ -34,6 +34,8 @@
> #define CN_VAL_PROC 0x1
> #define CN_IDX_CIFS 0x2
> #define CN_VAL_CIFS 0x1
> +#define CN_IDX_FS 0x3
> +#define CN_VAL_FS 0x1
>
> #define CN_NETLINK_USERS 1
>
> --- a/include/linux/fsnotify.h.orig 2006-01-03 11:21:10.000000000 +0800
> +++ b/include/linux/fsnotify.h 2006-03-22 21:48:24.000000000 +0800
> @@ -15,6 +15,7 @@
>
> #include <linux/dnotify.h>
> #include <linux/inotify.h>
> +#include <linux/fsevent.h>
>
> /*
> * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
> @@ -45,6 +46,8 @@ static inline void fsnotify_move(struct
> if (source) {
> inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
> }
> + raise_fsevent_move(old_dir, old_name, new_dir, new_name,
> + FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
> isdir = IN_ISDIR;
> dnotify_parent(dentry, DN_DELETE);
> inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
> + raise_fsevent(dentry,
> + FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
> {
> inode_dir_notify(inode, DN_CREATE);
> inotify_inode_queue_event(inode, IN_CREATE, 0, name);
> + raise_fsevent_create(inode, name, FSEVENT_CREATE);
> }
>
> /*
> @@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
> {
> inode_dir_notify(inode, DN_CREATE);
> inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
> + raise_fsevent_create(inode, name,
> + FSEVENT_CREATE | FSEVENT_ISDIR);
> }
>
> /*
> @@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
> dnotify_parent(dentry, DN_ACCESS);
> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
> inotify_inode_queue_event(inode, mask, 0, NULL);
> + raise_fsevent(dentry, FSEVENT_ACCESS |
> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
> dnotify_parent(dentry, DN_MODIFY);
> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
> inotify_inode_queue_event(inode, mask, 0, NULL);
> + raise_fsevent(dentry, FSEVENT_MODIFY |
> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -130,6 +142,9 @@ static inline void fsnotify_open(struct
>
> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
> inotify_inode_queue_event(inode, mask, 0, NULL);
> + raise_fsevent(dentry, FSEVENT_OPEN |
> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
> +
> }
>
> /*
> @@ -148,6 +163,8 @@ static inline void fsnotify_close(struct
>
> inotify_dentry_parent_queue_event(dentry, mask, 0, name);
> inotify_inode_queue_event(inode, mask, 0, NULL);
> + raise_fsevent(dentry, FSEVENT_CLOSE |
> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct
>
> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
> inotify_inode_queue_event(inode, mask, 0, NULL);
> + raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
> }
>
> /*
> @@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
> inotify_dentry_parent_queue_event(dentry, in_mask, 0,
> dentry->d_name.name);
> }
> +
> +#ifdef CONFIG_FS_EVENTS
> + {
> + u32 fsevent_mask = 0;
> + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
> + if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
> + else if (ia_valid & ATTR_ATIME)
> + fsevent_mask |= FSEVENT_ACCESS;
> + else if (ia_valid & ATTR_MTIME)
> + fsevent_mask |= FSEVENT_MODIFY;
> + if (ia_valid & ATTR_SIZE)
> + fsevent_mask |= FSEVENT_MODIFY;
> + if (fsevent_mask)
> + raise_fsevent(dentry, fsevent_mask);
> + }
> +#endif /* CONFIG_FS_EVENTS */
> }
>
> #ifdef CONFIG_INOTIFY /* inotify helpers */
> --- a/drivers/connector/Makefile.orig 2006-03-12 21:23:18.000000000 +0800
> +++ b/drivers/connector/Makefile 2006-03-12 21:24:25.000000000 +0800
> @@ -1,4 +1,5 @@
> obj-$(CONFIG_CONNECTOR) += cn.o
> obj-$(CONFIG_PROC_EVENTS) += cn_proc.o
> +obj-$(CONFIG_FS_EVENTS) += cn_fs.o
>
> cn-y += cn_queue.o connector.o
> --- a/drivers/connector/Kconfig.orig 2006-03-12 21:23:53.000000000 +0800
> +++ b/drivers/connector/Kconfig 2006-03-12 21:21:44.000000000 +0800
> @@ -18,4 +18,12 @@ config PROC_EVENTS
> Provide a connector that reports process events to userspace. Send
> events such as fork, exec, id change (uid, gid, suid, etc), and exit.
>
> +config FS_EVENTS
> + boolean "Report filesystem events to userspace"
> + depends on CONNECTOR=y
> + default y
> + ---help---
> + Provide a connector that reports filesystem events to userspace. The
> + reported event include open, read, write.
> +
> endmenu
> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
> +++ b/include/linux/fsevent.h 2006-03-16 22:52:06.000000000 +0800
> @@ -0,0 +1,86 @@
> +/*
> + * fsevent.h - filesystem events connector
> + *
> + * Copyright (C) 2006 Yi Yang <[email protected]>
> + * Based on cn_proc.h by Matt Helsley, IBM Corp
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + */
> +
> +#ifndef LINUX_FSEVENT_H
> +#define LINUX_FSEVENT_H
> +
> +#include <linux/types.h>
> +#include <linux/time.h>
> +#include <linux/connector.h>
> +
> +enum fsevent_type {
> + FSEVENT_ACCESS = 0x00000001, /* File was accessed */
> + FSEVENT_MODIFY = 0x00000002, /* File was modified */
> + FSEVENT_MODIFY_ATTRIB = 0x00000004, /* Metadata changed */
> + FSEVENT_CLOSE = 0x00000008, /* File was closed */
> + FSEVENT_OPEN = 0x00000010, /* File was opened */
> + FSEVENT_MOVE = 0x00000020, /* File was moved */
> + FSEVENT_CREATE = 0x00000040, /* File was created */
> + FSEVENT_DELETE = 0x00000080, /* File was deleted */
> + FSEVENT_CMDACK = 0x40000000, /* For used internally */

s/used/use/

> + FSEVENT_ISDIR = 0x80000000 /* It is set for a dir */
> +};

So we'd have DN_FOO, IN_FOO, and FSEVENT_FOO? Any chance inotify
definitions could be used instead of redefining all of these? Perhaps
that would simplify the code in fsnotify.h.

The only problem I can see is the FSEVENT_CMDACK type. This could be
avoided by breaking up the type field into an inotify-compatible mask
field and the type field (EVENT or CMDACK).

> +/*
> + * Userspace sends this enum to register with the kernel that it is listening
> + * for events on the connector.
> + */
> +enum fsevent_mode {
> + FSEVENT_LISTEN = 1,
> + FSEVENT_IGNORE = 2
> +};
> +

Process Events Connector uses this mechanism to avoid most of the event
generation code if there are no listeners.

Michael Kerrisk has privately suggested to me that this mechanism gives
userspace too much rope with which to hang itself. I think it just gives
userspace more rope.

That said, perhaps we can shorten the rope by adding a connector
function to quickly return a value indicating if a process in userspace
is listening to messages sent by the kernel. Then connectors could use
that function rather that reinvent the same mechanism.

> +struct fsevent {
> + __u32 type;
> + __u32 cpu;
> + struct timespec timestamp;
> + __u32 tgid;

I think pid_t would be more appropriate here instead of a __u32.
Also a tid field of type pid_t would be consistent with the Process
Events returned to userspace. If a userspace program ever wanted to
relate the two sets of events the tid field could be important. That
said it would be nice to know if any userspace programs are planning on
using this.

> + __u32 uid;
> + __u32 gid;
> + __u32 err;

The err field appears to be unused when sending events back to
userspace. I suggest reusing it for the event mask and using the type
field to indicate whether the message is a CMDACK or simply an EVENT.

> + __u32 len;
> + __u32 pname_len;
> + __u32 fname_len;
> + __u32 new_fname_len;

I think one of these _len fields could be dropped and later calculated
as:

new_fname_len = len - (pname_len + fname_len);

> + char name[0];
> +};
> +
> +#ifdef __KERNEL__
> +#ifdef CONFIG_FS_EVENTS
> +extern void raise_fsevent(struct dentry * dentryp, u32 mask);
> +extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
> + struct inode * newdir, const char * newname, u32 mask);
> +extern void raise_fsevent_create(struct inode * inode,
> + const char * name, u32 mask);
> +#else
> +static void raise_fsevent(struct dentry * dentryp, u32 mask);
> +{}
> +
> +static void raise_fsevent_move(struct inode * olddir, const char * oldname,
> + struct inode * newdir, const char * newname, u32 mask)
> +{}
> +
> +static void raise_fsevent_create(struct inode * inode,
> + const char * name, u32 mask)
> +{}
> +#endif /* CONFIG_FS_EVENTS */
> +#endif /* __KERNEL__ */
> +#endif /* LINUX_FSEVENT_H */
> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
> +++ b/drivers/connector/cn_fs.c 2006-03-16 22:54:28.000000000 +0800
> @@ -0,0 +1,200 @@
> +/*
> + * cn_fs.c - filesystem events connector
> + *
> + * Copyright (C) 2006 Yi Yang <[email protected]>
> + * Based on cn_proc.c by Matt Helsley, IBM Corp
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/ktime.h>
> +#include <linux/init.h>
> +#include <linux/fs.h>
> +#include <linux/fsevent.h>
> +#include <asm/atomic.h>
> +
> +#define CN_FS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct fsevent))
> +
> +static atomic_t cn_fs_event_listeners = ATOMIC_INIT(0);
> +static struct cb_id cn_fs_event_id = { CN_IDX_FS, CN_VAL_FS };
> +
> +static DEFINE_PER_CPU(__u32, cn_fs_event_counts) = { 0 };
> +static int fsevent_burst_limit = 100;
> +static int fsevent_ratelimit = 5 * HZ;
> +
> +static inline void get_seq(__u32 *ts, int *cpu)
> +{
> + *ts = get_cpu_var(cn_fs_event_counts)++;
> + *cpu = smp_processor_id();
> + put_cpu_var(cn_fs_event_counts);
> +}
> +
> +int __raise_fsevent(const char * oldname, const char * newname, u32 mask)

The return value of this function does not appear to be used.

> +{
> + struct cn_msg *msg;
> + struct fsevent *event;
> + __u8 * buffer;

While the contents of buffer are being sent to userspace eventually, the
pointer is not, so I don't think __u8 is necessary or conventional.

> + int namelen = 0;
> + static unsigned long last = 0;
> + static int fsevent_sum = 0;

Yuck, static local variables. IMHO these should be globals. It would
make the fact they aren't protected from concurrent access more obvious
(see below).

> + if (atomic_read(&cn_fs_event_listeners) < 1)
> + return 0;
> +
> + if (jiffies - last <= fsevent_ratelimit) {
> + if (fsevent_sum > fsevent_burst_limit) {
> + return -1;

OK, so you're rate limiting the events. Shouldn't you still boost the
sequence number so that userspace knows some events got dropped? Also
perhaps you can find an appropriate error to return instead of -1.

> + }

remove unecessary braces

> + fsevent_sum++;

Looks racy to me -- what's protecting fsevent_sum from access by
multiple cpus?

> + }
> + else {

I seem to recall that coding style would have you move the else up.

> + last = jiffies;
> + fsevent_sum = 0;
> + }
> +
> + namelen = strlen(current->comm) + strlen(oldname) + 2;
> + if (newname) {
> + namelen += strlen(newname) + 1;
> + }

remove braces

> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE + namelen, GFP_KERNEL))
> + == NULL) {

Pull the assignment out of the condition. You're not saving any space by
putting it into the if () and it's harder to read. I don't think the
__u8 cast is necessary..

> + printk("cn_fs: out of memory\n");

missing printk tag

> + return -1;

Perhaps:
return -ENOMEM;

> + }
> +
> + msg = (struct cn_msg*)buffer;
> + event = (struct fsevent*)msg->data;
> + get_seq(&msg->seq, &event->cpu);
> + ktime_get_ts(&event->timestamp);
> + event->type = mask;
> + event->tgid = current->tgid;
> + event->uid = current->uid;
> + event->gid = current->gid;
> + event->pname_len = strlen(current->comm);
> + memcpy(event->name, current->comm, event->pname_len);
> + event->name[event->pname_len] = '\0';
> + event->fname_len = strlen(oldname);
> + memcpy(event->name + event->pname_len + 1, oldname, strlen(oldname));
> + event->len = event->pname_len + event->fname_len + 2;
> + event->name[event->len-1] = '\0';
> + event->new_fname_len = 0;
> + if (newname) {
> + event->new_fname_len = strlen(newname);
> + memcpy(event->name + event->len, newname, strlen(newname));
> + event->len += event->new_fname_len + 1;
> + event->name[event->len-1] = '\0';
> + }


Maybe it's just my personal taste, but you could use pointer arithmetic
while copying the strings into the event buffer:

name = event->name;

strncpy(name, current->comm, event->pname_len);
name += event->pname_len;
name[0] = '\0';
name++;

strncpy(name, oldname, event->fname_len);
name += event->fname_len;
name[0] = '\0';
name++;
...

Then you could break it out into a small function that appends the
string to the buffer for you and call it for each string:

static void append_string(char **dest, const char *src, size_t len)
{
strncpy(*dest, src, len);
*dest += len;
(*dest)[0] = '\0';
(*dest)++;
}

...
int __raise_fsevent(const char * oldname, const char * newname,
u32 mask)
{
...
name = event->name;
append_string(&name, current->comm, event->pname_len);
event->fname_len = strlen(oldname);
append_string(&name, oldname, event->fname_len);
...
}

> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
> + msg->ack = 0;
> + msg->len = sizeof(struct fsevent) + event->len;
> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
> + kfree(buffer);
> + return 0;
> +}
> +
> +void raise_fsevent(struct dentry * dentryp, u32 mask)
> +{
> + __raise_fsevent(dentryp->d_name.name, NULL, mask);
> +}
> +EXPORT_SYMBOL_GPL(raise_fsevent);
> +
> +void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
> +{
> + __raise_fsevent(name, NULL, mask);
> +}
> +EXPORT_SYMBOL_GPL(raise_fsevent_create);
> +
> +void raise_fsevent_move(struct inode * olddir, const char * oldname,
> + struct inode * newdir, const char * newname, u32 mask)
> +{
> + __raise_fsevent(oldname, newname, mask);
> +}
> +EXPORT_SYMBOL_GPL(raise_fsevent_move);
> +
> +static void cn_fs_event_ack(int err, int rcvd_seq, int rcvd_ack)
> +{
> + struct cn_msg *msg;
> + struct fsevent *event;
> + __u8 * buffer = NULL;
> +
> + if (atomic_read(&cn_fs_event_listeners) < 1)
> + return;
> +
> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE, GFP_KERNEL)) == NULL) {

I don't think the __u8 cast is necessary

> + printk("cn_fs: out of memory\n");

missing printk tag

> + return;
> + }
> +
> + msg = (struct cn_msg*)buffer;
> + event = (struct fsevent*)msg->data;
> + msg->seq = rcvd_seq;
> + ktime_get_ts(&event->timestamp);
> + event->cpu = -1;
> + event->type = FSEVENT_CMDACK;
> + event->tgid = 0;
> + event->uid = 0;
> + event->gid = 0;
> + event->len = 0;
> + event->pname_len = 0;
> + event->fname_len = 0;
> + event->new_fname_len = 0;
> + event->err = err;
> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
> + msg->ack = rcvd_ack + 1;
> + msg->len = sizeof(struct fsevent);
> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
> +}
> +
> +static void cn_fsevent_mode_ctl(void *data)
> +{
> + struct cn_msg *msg = data;
> + enum fsevent_mode *mode = NULL;
> + int err = 0;
> +
> + if (msg->len != sizeof(*mode))
> + return;
> +
> + mode = (enum fsevent_mode*)msg->data;
> + switch (*mode) {
> + case FSEVENT_LISTEN:
> + atomic_inc(&cn_fs_event_listeners);
> + break;
> + case FSEVENT_IGNORE:
> + atomic_dec(&cn_fs_event_listeners);
> + break;
> + default:
> + err = EINVAL;
> + break;
> + }
> + cn_fs_event_ack(err, msg->seq, msg->ack);
> +}
> +
> +static int __init cn_fs_init(void)
> +{
> + int err;
> +
> + if ((err = cn_add_callback(&cn_fs_event_id, "cn_fs",
> + &cn_fsevent_mode_ctl))) {
> + printk(KERN_WARNING "cn_fs failed to register\n");
> + return err;
> + }
> + return 0;
> +}
> +
> +module_init(cn_fs_init);


2006-03-23 08:17:27

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
> This patch implements a new connector, Filesystem Event Connector,
> the user can monitor filesystem activities via it, currently, it
> can monitor access, attribute change, open, create, modify, delete,
> move and close of any file or directory.


how is this not redundant functionality with the audit subsystem ?


2006-03-23 08:53:09

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Wed, Mar 22, 2006 at 11:43:28PM -0800, Matt Helsley ([email protected]) wrote:
> On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
> > This patch implements a new connector, Filesystem Event Connector,
> > the user can monitor filesystem activities via it, currently, it
> > can monitor access, attribute change, open, create, modify, delete,
> > move and close of any file or directory.
> >
> > Every filesystem event will include tgid, uid and gid of the process
> > which triggered this event, process name, file or directory name
> > operated by it.
> >
> > Filesystem events connector is never a duplicate of inotify, inotify
> > just concerns change on file or directory, Beagle uses it to watch
> > file changes in order to regenerate index for it, inotify can't tell
> > us who did that change and what is its process name, but filesystem
> > events connector can do these, moreover inotify's overhead is greater
> > than filesystem events connector, inotify needs compare inode with
> > watched file or directories list to decide whether it should generate an
> > inotify_event, some locks also increase overhead, filesystem event
> > connector hasn't these overhead, it just generates a fsevent and send.
> >
> > To be important, filesystem event connector doesn't add any new system
> > call, the user space application can make use of it by netlink socket,
> > but inotify added several system calls, many events mechanism in kernel
> > have used netlink as communication way with user space, for example,
> > KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
> > to unify events interface to netlink, the user space application can use
> > it very easy.
> >
> > Signed-off-by: Yi Yang <[email protected]>

Ugh, I like the idea!

> > --- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
> > +++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
> > @@ -34,6 +34,8 @@
> > #define CN_VAL_PROC 0x1
> > #define CN_IDX_CIFS 0x2
> > #define CN_VAL_CIFS 0x1
> > +#define CN_IDX_FS 0x3
> > +#define CN_VAL_FS 0x1


Please add some on-line comment about what it is here.

> > #define CN_NETLINK_USERS 1


This must be increased each time new id is added.
Although connector code does allocation with reserve, better to not
exhaust it.
Please increase it to 3.

...

> > +/*
> > + * Userspace sends this enum to register with the kernel that it is listening
> > + * for events on the connector.
> > + */
> > +enum fsevent_mode {
> > + FSEVENT_LISTEN = 1,
> > + FSEVENT_IGNORE = 2
> > +};
> > +
>
> Process Events Connector uses this mechanism to avoid most of the event
> generation code if there are no listeners.
>
> Michael Kerrisk has privately suggested to me that this mechanism gives
> userspace too much rope with which to hang itself. I think it just gives
> userspace more rope.
>
> That said, perhaps we can shorten the rope by adding a connector
> function to quickly return a value indicating if a process in userspace
> is listening to messages sent by the kernel. Then connectors could use
> that function rather that reinvent the same mechanism.

Btw, current connector code performs check for listeners before it
allocates any skbs.
If there are no listenres -ESRCH is returned from cn_netlink_send().

...

> Pull the assignment out of the condition. You're not saving any space by
> putting it into the if () and it's harder to read. I don't think the
> __u8 cast is necessary..
>
> > + printk("cn_fs: out of memory\n");
>
> missing printk tag

Do not print such info at all.

> > +void raise_fsevent(struct dentry * dentryp, u32 mask)
> > +{
> > + __raise_fsevent(dentryp->d_name.name, NULL, mask);
> > +}
> > +EXPORT_SYMBOL_GPL(raise_fsevent);
> > +
> > +void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
> > +{
> > + __raise_fsevent(name, NULL, mask);
> > +}
> > +EXPORT_SYMBOL_GPL(raise_fsevent_create);
> > +
> > +void raise_fsevent_move(struct inode * olddir, const char * oldname,
> > + struct inode * newdir, const char * newname, u32 mask)
> > +{
> > + __raise_fsevent(oldname, newname, mask);
> > +}
> > +EXPORT_SYMBOL_GPL(raise_fsevent_move);

Are there external modules which might use it?

--
Evgeniy Polyakov

2006-03-23 22:04:58

by Yi Yang

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Serge E. Hallyn wrote:
> Quoting Yi Yang ([email protected]):
>
>> +#ifdef __KERNEL__
>> +#ifdef CONFIG_FS_EVENTS
>> +extern void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask);
>> +extern void raise_fsevent_create(struct inode * inode,
>> + const char * name, u32 mask);
>> +#else
>> +static void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +{}
>>
>
> Hmm, this compiles, if !CONFIG_FS_EVENTS?
>
Sorry for my mistake, thank for your care, I'll resend it to correct it,
thanks again.
>
> -serge
>
>

2006-03-24 00:44:12

by Yi Yang

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Arjan van de Ven wrote:
> On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
>
>> This patch implements a new connector, Filesystem Event Connector,
>> the user can monitor filesystem activities via it, currently, it
>> can monitor access, attribute change, open, create, modify, delete,
>> move and close of any file or directory.
>>
>
>
> how is this not redundant functionality with the audit subsystem ?
>
If you open Audit option, audit subsystem will audit all the syscalls,
so it adds big overhead for the whole system,
but Filesystem Event Connector just concerns those operations related to
the filesystem, so it has little overhead,
moreover, audit subsystem is a complicated function block, it not only
sends audit results to netlink interface, but also
send them to klog or syslog, so it will add big overhead.

I think you can look File Event Connector as subset of audit subsystem,
but File Event Connector is a very lightweight
subsystem.
>
>
>

2006-03-24 01:20:34

by Yi Yang

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Matt Helsley wrote:

Thanks for Matt's careful review. I'll follow your advices to modify it
and new version will be released soon.
> On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
>
>> This patch implements a new connector, Filesystem Event Connector,
>> the user can monitor filesystem activities via it, currently, it
>> can monitor access, attribute change, open, create, modify, delete,
>> move and close of any file or directory.
>>
>> Every filesystem event will include tgid, uid and gid of the process
>> which triggered this event, process name, file or directory name
>> operated by it.
>>
>> Filesystem events connector is never a duplicate of inotify, inotify
>> just concerns change on file or directory, Beagle uses it to watch
>> file changes in order to regenerate index for it, inotify can't tell
>> us who did that change and what is its process name, but filesystem
>> events connector can do these, moreover inotify's overhead is greater
>> than filesystem events connector, inotify needs compare inode with
>> watched file or directories list to decide whether it should generate an
>> inotify_event, some locks also increase overhead, filesystem event
>> connector hasn't these overhead, it just generates a fsevent and send.
>>
>> To be important, filesystem event connector doesn't add any new system
>> call, the user space application can make use of it by netlink socket,
>> but inotify added several system calls, many events mechanism in kernel
>> have used netlink as communication way with user space, for example,
>> KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
>> to unify events interface to netlink, the user space application can use
>> it very easy.
>>
>> Signed-off-by: Yi Yang <[email protected]>
>>
>> --- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
>> +++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
>> @@ -34,6 +34,8 @@
>> #define CN_VAL_PROC 0x1
>> #define CN_IDX_CIFS 0x2
>> #define CN_VAL_CIFS 0x1
>> +#define CN_IDX_FS 0x3
>> +#define CN_VAL_FS 0x1
>>
>> #define CN_NETLINK_USERS 1
>>
>> --- a/include/linux/fsnotify.h.orig 2006-01-03 11:21:10.000000000 +0800
>> +++ b/include/linux/fsnotify.h 2006-03-22 21:48:24.000000000 +0800
>> @@ -15,6 +15,7 @@
>>
>> #include <linux/dnotify.h>
>> #include <linux/inotify.h>
>> +#include <linux/fsevent.h>
>>
>> /*
>> * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
>> @@ -45,6 +46,8 @@ static inline void fsnotify_move(struct
>> if (source) {
>> inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
>> }
>> + raise_fsevent_move(old_dir, old_name, new_dir, new_name,
>> + FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
>> isdir = IN_ISDIR;
>> dnotify_parent(dentry, DN_DELETE);
>> inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
>> + raise_fsevent(dentry,
>> + FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
>> {
>> inode_dir_notify(inode, DN_CREATE);
>> inotify_inode_queue_event(inode, IN_CREATE, 0, name);
>> + raise_fsevent_create(inode, name, FSEVENT_CREATE);
>> }
>>
>> /*
>> @@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
>> {
>> inode_dir_notify(inode, DN_CREATE);
>> inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
>> + raise_fsevent_create(inode, name,
>> + FSEVENT_CREATE | FSEVENT_ISDIR);
>> }
>>
>> /*
>> @@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
>> dnotify_parent(dentry, DN_ACCESS);
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_ACCESS |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
>> dnotify_parent(dentry, DN_MODIFY);
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_MODIFY |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -130,6 +142,9 @@ static inline void fsnotify_open(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_OPEN |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> +
>> }
>>
>> /*
>> @@ -148,6 +163,8 @@ static inline void fsnotify_close(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_CLOSE |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
>> inotify_dentry_parent_queue_event(dentry, in_mask, 0,
>> dentry->d_name.name);
>> }
>> +
>> +#ifdef CONFIG_FS_EVENTS
>> + {
>> + u32 fsevent_mask = 0;
>> + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
>> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
>> + if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
>> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
>> + else if (ia_valid & ATTR_ATIME)
>> + fsevent_mask |= FSEVENT_ACCESS;
>> + else if (ia_valid & ATTR_MTIME)
>> + fsevent_mask |= FSEVENT_MODIFY;
>> + if (ia_valid & ATTR_SIZE)
>> + fsevent_mask |= FSEVENT_MODIFY;
>> + if (fsevent_mask)
>> + raise_fsevent(dentry, fsevent_mask);
>> + }
>> +#endif /* CONFIG_FS_EVENTS */
>> }
>>
>> #ifdef CONFIG_INOTIFY /* inotify helpers */
>> --- a/drivers/connector/Makefile.orig 2006-03-12 21:23:18.000000000 +0800
>> +++ b/drivers/connector/Makefile 2006-03-12 21:24:25.000000000 +0800
>> @@ -1,4 +1,5 @@
>> obj-$(CONFIG_CONNECTOR) += cn.o
>> obj-$(CONFIG_PROC_EVENTS) += cn_proc.o
>> +obj-$(CONFIG_FS_EVENTS) += cn_fs.o
>>
>> cn-y += cn_queue.o connector.o
>> --- a/drivers/connector/Kconfig.orig 2006-03-12 21:23:53.000000000 +0800
>> +++ b/drivers/connector/Kconfig 2006-03-12 21:21:44.000000000 +0800
>> @@ -18,4 +18,12 @@ config PROC_EVENTS
>> Provide a connector that reports process events to userspace. Send
>> events such as fork, exec, id change (uid, gid, suid, etc), and exit.
>>
>> +config FS_EVENTS
>> + boolean "Report filesystem events to userspace"
>> + depends on CONNECTOR=y
>> + default y
>> + ---help---
>> + Provide a connector that reports filesystem events to userspace. The
>> + reported event include open, read, write.
>> +
>> endmenu
>> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
>> +++ b/include/linux/fsevent.h 2006-03-16 22:52:06.000000000 +0800
>> @@ -0,0 +1,86 @@
>> +/*
>> + * fsevent.h - filesystem events connector
>> + *
>> + * Copyright (C) 2006 Yi Yang <[email protected]>
>> + * Based on cn_proc.h by Matt Helsley, IBM Corp
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> + */
>> +
>> +#ifndef LINUX_FSEVENT_H
>> +#define LINUX_FSEVENT_H
>> +
>> +#include <linux/types.h>
>> +#include <linux/time.h>
>> +#include <linux/connector.h>
>> +
>> +enum fsevent_type {
>> + FSEVENT_ACCESS = 0x00000001, /* File was accessed */
>> + FSEVENT_MODIFY = 0x00000002, /* File was modified */
>> + FSEVENT_MODIFY_ATTRIB = 0x00000004, /* Metadata changed */
>> + FSEVENT_CLOSE = 0x00000008, /* File was closed */
>> + FSEVENT_OPEN = 0x00000010, /* File was opened */
>> + FSEVENT_MOVE = 0x00000020, /* File was moved */
>> + FSEVENT_CREATE = 0x00000040, /* File was created */
>> + FSEVENT_DELETE = 0x00000080, /* File was deleted */
>> + FSEVENT_CMDACK = 0x40000000, /* For used internally */
>>
>
> s/used/use/
>
>
>> + FSEVENT_ISDIR = 0x80000000 /* It is set for a dir */
>> +};
>>
>
> So we'd have DN_FOO, IN_FOO, and FSEVENT_FOO? Any chance inotify
> definitions could be used instead of redefining all of these? Perhaps
> that would simplify the code in fsnotify.h.
>
> The only problem I can see is the FSEVENT_CMDACK type. This could be
> avoided by breaking up the type field into an inotify-compatible mask
> field and the type field (EVENT or CMDACK).
>
This advice is very good, I'll use this kind of way.
>
>> +/*
>> + * Userspace sends this enum to register with the kernel that it is listening
>> + * for events on the connector.
>> + */
>> +enum fsevent_mode {
>> + FSEVENT_LISTEN = 1,
>> + FSEVENT_IGNORE = 2
>> +};
>> +
>>
>
> Process Events Connector uses this mechanism to avoid most of the event
> generation code if there are no listeners.
>
> Michael Kerrisk has privately suggested to me that this mechanism gives
> userspace too much rope with which to hang itself. I think it just gives
> userspace more rope.
>
> That said, perhaps we can shorten the rope by adding a connector
> function to quickly return a value indicating if a process in userspace
> is listening to messages sent by the kernel. Then connectors could use
> that function rather that reinvent the same mechanism.
>
After Evgeniy did it, I'll follow his way.
>
>> +struct fsevent {
>> + __u32 type;
>> + __u32 cpu;
>> + struct timespec timestamp;
>> + __u32 tgid;
>>
>
> I think pid_t would be more appropriate here instead of a __u32.
> Also a tid field of type pid_t would be consistent with the Process
> Events returned to userspace. If a userspace program ever wanted to
> relate the two sets of events the tid field could be important. That
> said it would be nice to know if any userspace programs are planning on
> using this.
>
I ever used pid_t, but in the user space, it has problem, size of pid_t
in kernel space different from in user space,
I don't know why, do you have an idea about it?
>
>> + __u32 uid;
>> + __u32 gid;
>> + __u32 err;
>>
>
> The err field appears to be unused when sending events back to
> userspace. I suggest reusing it for the event mask and using the type
> field to indicate whether the message is a CMDACK or simply an EVENT.
>
This is a good idea, I'll do so.
>
>> + __u32 len;
>> + __u32 pname_len;
>> + __u32 fname_len;
>> + __u32 new_fname_len;
>>
>
> I think one of these _len fields could be dropped and later calculated
> as:
>
> new_fname_len = len - (pname_len + fname_len);
>
>
>> + char name[0];
>> +};
>> +
>> +#ifdef __KERNEL__
>> +#ifdef CONFIG_FS_EVENTS
>> +extern void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask);
>> +extern void raise_fsevent_create(struct inode * inode,
>> + const char * name, u32 mask);
>> +#else
>> +static void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +{}
>> +
>> +static void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask)
>> +{}
>> +
>> +static void raise_fsevent_create(struct inode * inode,
>> + const char * name, u32 mask)
>> +{}
>> +#endif /* CONFIG_FS_EVENTS */
>> +#endif /* __KERNEL__ */
>> +#endif /* LINUX_FSEVENT_H */
>> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
>> +++ b/drivers/connector/cn_fs.c 2006-03-16 22:54:28.000000000 +0800
>> @@ -0,0 +1,200 @@
>> +/*
>> + * cn_fs.c - filesystem events connector
>> + *
>> + * Copyright (C) 2006 Yi Yang <[email protected]>
>> + * Based on cn_proc.c by Matt Helsley, IBM Corp
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/ktime.h>
>> +#include <linux/init.h>
>> +#include <linux/fs.h>
>> +#include <linux/fsevent.h>
>> +#include <asm/atomic.h>
>> +
>> +#define CN_FS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct fsevent))
>> +
>> +static atomic_t cn_fs_event_listeners = ATOMIC_INIT(0);
>> +static struct cb_id cn_fs_event_id = { CN_IDX_FS, CN_VAL_FS };
>> +
>> +static DEFINE_PER_CPU(__u32, cn_fs_event_counts) = { 0 };
>> +static int fsevent_burst_limit = 100;
>> +static int fsevent_ratelimit = 5 * HZ;
>> +
>> +static inline void get_seq(__u32 *ts, int *cpu)
>> +{
>> + *ts = get_cpu_var(cn_fs_event_counts)++;
>> + *cpu = smp_processor_id();
>> + put_cpu_var(cn_fs_event_counts);
>> +}
>> +
>> +int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
>>
>
> The return value of this function does not appear to be used.
>
If some modules want to use it to transfer file system events reliably,
the return value will be very valuble,
because the caller can retry the transfer until it successes.
>
>> +{
>> + struct cn_msg *msg;
>> + struct fsevent *event;
>> + __u8 * buffer;
>>
>
> While the contents of buffer are being sent to userspace eventually, the
> pointer is not, so I don't think __u8 is necessary or conventional.
>
I agree.
>
>> + int namelen = 0;
>> + static unsigned long last = 0;
>> + static int fsevent_sum = 0;
>>
>
> Yuck, static local variables. IMHO these should be globals. It would
> make the fact they aren't protected from concurrent access more obvious
> (see below).
>
Yes, they should be global.
>
>> + if (atomic_read(&cn_fs_event_listeners) < 1)
>> + return 0;
>> +
>> + if (jiffies - last <= fsevent_ratelimit) {
>> + if (fsevent_sum > fsevent_burst_limit) {
>> + return -1;
>>
>
> OK, so you're rate limiting the events. Shouldn't you still boost the
> sequence number so that userspace knows some events got dropped? Also
> perhaps you can find an appropriate error to return instead of -1.
>
Good idea.
>
>> + }
>>
>
> remove unecessary braces
>
>
>> + fsevent_sum++;
>>
>
> Looks racy to me -- what's protecting fsevent_sum from access by
> multiple cpus?
>
This just is used to limit event rate when the user space application
leads to an unlimited events loop.
so it mustn't be precise, I used spinlock originally, but Andrew thinks
lock overhead is big, inotify has led to
some frustrating lock overhead, so I decide to remove it, in fact
fsevent_sum just is the number used to limit rate,
some race conditions don't effect the rate limit.
>
>> + }
>> + else {
>>
>
> I seem to recall that coding style would have you move the else up.
>
>
>> + last = jiffies;
>> + fsevent_sum = 0;
>> + }
>> +
>> + namelen = strlen(current->comm) + strlen(oldname) + 2;
>> + if (newname) {
>> + namelen += strlen(newname) + 1;
>> + }
>>
>
> remove braces
>
>
>> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE + namelen, GFP_KERNEL))
>> + == NULL) {
>>
>
> Pull the assignment out of the condition. You're not saving any space by
> putting it into the if () and it's harder to read. I don't think the
> __u8 cast is necessary..
>
I'll do so.
>
>> + printk("cn_fs: out of memory\n");
>>
>
> missing printk tag
>
>
>> + return -1;
>>
>
> Perhaps:
> return -ENOMEM;
>
>
>> + }
>> +
>> + msg = (struct cn_msg*)buffer;
>> + event = (struct fsevent*)msg->data;
>> + get_seq(&msg->seq, &event->cpu);
>> + ktime_get_ts(&event->timestamp);
>> + event->type = mask;
>> + event->tgid = current->tgid;
>> + event->uid = current->uid;
>> + event->gid = current->gid;
>> + event->pname_len = strlen(current->comm);
>> + memcpy(event->name, current->comm, event->pname_len);
>> + event->name[event->pname_len] = '\0';
>> + event->fname_len = strlen(oldname);
>> + memcpy(event->name + event->pname_len + 1, oldname, strlen(oldname));
>> + event->len = event->pname_len + event->fname_len + 2;
>> + event->name[event->len-1] = '\0';
>> + event->new_fname_len = 0;
>> + if (newname) {
>> + event->new_fname_len = strlen(newname);
>> + memcpy(event->name + event->len, newname, strlen(newname));
>> + event->len += event->new_fname_len + 1;
>> + event->name[event->len-1] = '\0';
>> + }
>>
>
>
> Maybe it's just my personal taste, but you could use pointer arithmetic
> while copying the strings into the event buffer:
>
> name = event->name;
>
> strncpy(name, current->comm, event->pname_len);
> name += event->pname_len;
> name[0] = '\0';
> name++;
>
> strncpy(name, oldname, event->fname_len);
> name += event->fname_len;
> name[0] = '\0';
> name++;
> ...
>
> Then you could break it out into a small function that appends the
> string to the buffer for you and call it for each string:
>
> static void append_string(char **dest, const char *src, size_t len)
> {
> strncpy(*dest, src, len);
> *dest += len;
> (*dest)[0] = '\0';
> (*dest)++;
> }
>
> ...
> int __raise_fsevent(const char * oldname, const char * newname,
> u32 mask)
> {
> ...
> name = event->name;
> append_string(&name, current->comm, event->pname_len);
> event->fname_len = strlen(oldname);
> append_string(&name, oldname, event->fname_len);
> ...
> }
>
>
>> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
>> + msg->ack = 0;
>> + msg->len = sizeof(struct fsevent) + event->len;
>> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
>> + kfree(buffer);
>> + return 0;
>> +}
>> +
>> +void raise_fsevent(struct dentry * dentryp, u32 mask)
>> +{
>> + __raise_fsevent(dentryp->d_name.name, NULL, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent);
>> +
>> +void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
>> +{
>> + __raise_fsevent(name, NULL, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent_create);
>> +
>> +void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask)
>> +{
>> + __raise_fsevent(oldname, newname, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent_move);
>> +
>> +static void cn_fs_event_ack(int err, int rcvd_seq, int rcvd_ack)
>> +{
>> + struct cn_msg *msg;
>> + struct fsevent *event;
>> + __u8 * buffer = NULL;
>> +
>> + if (atomic_read(&cn_fs_event_listeners) < 1)
>> + return;
>> +
>> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE, GFP_KERNEL)) == NULL) {
>>
>
> I don't think the __u8 cast is necessary
>
>
>> + printk("cn_fs: out of memory\n");
>>
>
> missing printk tag
>
>
>> + return;
>> + }
>> +
>> + msg = (struct cn_msg*)buffer;
>> + event = (struct fsevent*)msg->data;
>> + msg->seq = rcvd_seq;
>> + ktime_get_ts(&event->timestamp);
>> + event->cpu = -1;
>> + event->type = FSEVENT_CMDACK;
>> + event->tgid = 0;
>> + event->uid = 0;
>> + event->gid = 0;
>> + event->len = 0;
>> + event->pname_len = 0;
>> + event->fname_len = 0;
>> + event->new_fname_len = 0;
>> + event->err = err;
>> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
>> + msg->ack = rcvd_ack + 1;
>> + msg->len = sizeof(struct fsevent);
>> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
>> +}
>> +
>> +static void cn_fsevent_mode_ctl(void *data)
>> +{
>> + struct cn_msg *msg = data;
>> + enum fsevent_mode *mode = NULL;
>> + int err = 0;
>> +
>> + if (msg->len != sizeof(*mode))
>> + return;
>> +
>> + mode = (enum fsevent_mode*)msg->data;
>> + switch (*mode) {
>> + case FSEVENT_LISTEN:
>> + atomic_inc(&cn_fs_event_listeners);
>> + break;
>> + case FSEVENT_IGNORE:
>> + atomic_dec(&cn_fs_event_listeners);
>> + break;
>> + default:
>> + err = EINVAL;
>> + break;
>> + }
>> + cn_fs_event_ack(err, msg->seq, msg->ack);
>> +}
>> +
>> +static int __init cn_fs_init(void)
>> +{
>> + int err;
>> +
>> + if ((err = cn_add_callback(&cn_fs_event_id, "cn_fs",
>> + &cn_fsevent_mode_ctl))) {
>> + printk(KERN_WARNING "cn_fs failed to register\n");
>> + return err;
>> + }
>> + return 0;
>> +}
>> +
>> +module_init(cn_fs_init);
>>
>
>
>
>

2006-03-24 01:24:55

by Yi Yang

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Evgeniy Polyakov wrote:
> On Wed, Mar 22, 2006 at 11:43:28PM -0800, Matt Helsley ([email protected]) wrote:
>
>> On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
>>
>>> This patch implements a new connector, Filesystem Event Connector,
>>> the user can monitor filesystem activities via it, currently, it
>>> can monitor access, attribute change, open, create, modify, delete,
>>> move and close of any file or directory.
>>>
>>> Every filesystem event will include tgid, uid and gid of the process
>>> which triggered this event, process name, file or directory name
>>> operated by it.
>>>
>>> Filesystem events connector is never a duplicate of inotify, inotify
>>> just concerns change on file or directory, Beagle uses it to watch
>>> file changes in order to regenerate index for it, inotify can't tell
>>> us who did that change and what is its process name, but filesystem
>>> events connector can do these, moreover inotify's overhead is greater
>>> than filesystem events connector, inotify needs compare inode with
>>> watched file or directories list to decide whether it should generate an
>>> inotify_event, some locks also increase overhead, filesystem event
>>> connector hasn't these overhead, it just generates a fsevent and send.
>>>
>>> To be important, filesystem event connector doesn't add any new system
>>> call, the user space application can make use of it by netlink socket,
>>> but inotify added several system calls, many events mechanism in kernel
>>> have used netlink as communication way with user space, for example,
>>> KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
>>> to unify events interface to netlink, the user space application can use
>>> it very easy.
>>>
>>> Signed-off-by: Yi Yang <[email protected]>
>>>
>
> Ugh, I like the idea!
>
>
>>> --- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
>>> +++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
>>> @@ -34,6 +34,8 @@
>>> #define CN_VAL_PROC 0x1
>>> #define CN_IDX_CIFS 0x2
>>> #define CN_VAL_CIFS 0x1
>>> +#define CN_IDX_FS 0x3
>>> +#define CN_VAL_FS 0x1
>>>
>
>
> Please add some on-line comment about what it is here.
>
OK.
>
>>> #define CN_NETLINK_USERS 1
>>>
>
>
> This must be increased each time new id is added.
> Although connector code does allocation with reserve, better to not
> exhaust it.
> Please increase it to 3.
>
> ...
>
>
I'll do it.
>>> +/*
>>> + * Userspace sends this enum to register with the kernel that it is listening
>>> + * for events on the connector.
>>> + */
>>> +enum fsevent_mode {
>>> + FSEVENT_LISTEN = 1,
>>> + FSEVENT_IGNORE = 2
>>> +};
>>> +
>>>
>> Process Events Connector uses this mechanism to avoid most of the event
>> generation code if there are no listeners.
>>
>> Michael Kerrisk has privately suggested to me that this mechanism gives
>> userspace too much rope with which to hang itself. I think it just gives
>> userspace more rope.
>>
>> That said, perhaps we can shorten the rope by adding a connector
>> function to quickly return a value indicating if a process in userspace
>> is listening to messages sent by the kernel. Then connectors could use
>> that function rather that reinvent the same mechanism.
>>
>
> Btw, current connector code performs check for listeners before it
> allocates any skbs.
> If there are no listenres -ESRCH is returned from cn_netlink_send().
>
> ...
>
>
>> Pull the assignment out of the condition. You're not saving any space by
>> putting it into the if () and it's harder to read. I don't think the
>> __u8 cast is necessary..
>>
>>
>>> + printk("cn_fs: out of memory\n");
>>>
>> missing printk tag
>>
>
> Do not print such info at all.
>
OK.
>
>>> +void raise_fsevent(struct dentry * dentryp, u32 mask)
>>> +{
>>> + __raise_fsevent(dentryp->d_name.name, NULL, mask);
>>> +}
>>> +EXPORT_SYMBOL_GPL(raise_fsevent);
>>> +
>>> +void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
>>> +{
>>> + __raise_fsevent(name, NULL, mask);
>>> +}
>>> +EXPORT_SYMBOL_GPL(raise_fsevent_create);
>>> +
>>> +void raise_fsevent_move(struct inode * olddir, const char * oldname,
>>> + struct inode * newdir, const char * newname, u32 mask)
>>> +{
>>> + __raise_fsevent(oldname, newname, mask);
>>> +}
>>> +EXPORT_SYMBOL_GPL(raise_fsevent_move);
>>>
>
> Are there external modules which might use it?
>
Yes, nfs will use it indirectly, because fsnotify is a common file
system code path. if nfs is configured into module, compiler will complain
.

2006-03-24 02:49:09

by Yi Yang

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

Matt Helsley wrote:
> On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
>
>> This patch implements a new connector, Filesystem Event Connector,
>> the user can monitor filesystem activities via it, currently, it
>> can monitor access, attribute change, open, create, modify, delete,
>> move and close of any file or directory.
>>
>> Every filesystem event will include tgid, uid and gid of the process
>> which triggered this event, process name, file or directory name
>> operated by it.
>>
>> Filesystem events connector is never a duplicate of inotify, inotify
>> just concerns change on file or directory, Beagle uses it to watch
>> file changes in order to regenerate index for it, inotify can't tell
>> us who did that change and what is its process name, but filesystem
>> events connector can do these, moreover inotify's overhead is greater
>> than filesystem events connector, inotify needs compare inode with
>> watched file or directories list to decide whether it should generate an
>> inotify_event, some locks also increase overhead, filesystem event
>> connector hasn't these overhead, it just generates a fsevent and send.
>>
>> To be important, filesystem event connector doesn't add any new system
>> call, the user space application can make use of it by netlink socket,
>> but inotify added several system calls, many events mechanism in kernel
>> have used netlink as communication way with user space, for example,
>> KOBJECT_UEVENT, PROC_EVENTS, to use netlink will make it more possible
>> to unify events interface to netlink, the user space application can use
>> it very easy.
>>
>> Signed-off-by: Yi Yang <[email protected]>
>>
>> --- a/include/linux/connector.h.orig 2006-03-15 23:21:37.000000000 +0800
>> +++ b/include/linux/connector.h 2006-03-15 23:23:09.000000000 +0800
>> @@ -34,6 +34,8 @@
>> #define CN_VAL_PROC 0x1
>> #define CN_IDX_CIFS 0x2
>> #define CN_VAL_CIFS 0x1
>> +#define CN_IDX_FS 0x3
>> +#define CN_VAL_FS 0x1
>>
>> #define CN_NETLINK_USERS 1
>>
>> --- a/include/linux/fsnotify.h.orig 2006-01-03 11:21:10.000000000 +0800
>> +++ b/include/linux/fsnotify.h 2006-03-22 21:48:24.000000000 +0800
>> @@ -15,6 +15,7 @@
>>
>> #include <linux/dnotify.h>
>> #include <linux/inotify.h>
>> +#include <linux/fsevent.h>
>>
>> /*
>> * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
>> @@ -45,6 +46,8 @@ static inline void fsnotify_move(struct
>> if (source) {
>> inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
>> }
>> + raise_fsevent_move(old_dir, old_name, new_dir, new_name,
>> + FSEVENT_MOVE | (isdir?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -56,6 +59,8 @@ static inline void fsnotify_nameremove(s
>> isdir = IN_ISDIR;
>> dnotify_parent(dentry, DN_DELETE);
>> inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
>> + raise_fsevent(dentry,
>> + FSEVENT_DELETE | (isdir?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -74,6 +79,7 @@ static inline void fsnotify_create(struc
>> {
>> inode_dir_notify(inode, DN_CREATE);
>> inotify_inode_queue_event(inode, IN_CREATE, 0, name);
>> + raise_fsevent_create(inode, name, FSEVENT_CREATE);
>> }
>>
>> /*
>> @@ -83,6 +89,8 @@ static inline void fsnotify_mkdir(struct
>> {
>> inode_dir_notify(inode, DN_CREATE);
>> inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
>> + raise_fsevent_create(inode, name,
>> + FSEVENT_CREATE | FSEVENT_ISDIR);
>> }
>>
>> /*
>> @@ -99,6 +107,8 @@ static inline void fsnotify_access(struc
>> dnotify_parent(dentry, DN_ACCESS);
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_ACCESS |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -115,6 +125,8 @@ static inline void fsnotify_modify(struc
>> dnotify_parent(dentry, DN_MODIFY);
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_MODIFY |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -130,6 +142,9 @@ static inline void fsnotify_open(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_OPEN |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> +
>> }
>>
>> /*
>> @@ -148,6 +163,8 @@ static inline void fsnotify_close(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_CLOSE |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -163,6 +180,8 @@ static inline void fsnotify_xattr(struct
>>
>> inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
>> inotify_inode_queue_event(inode, mask, 0, NULL);
>> + raise_fsevent(dentry, FSEVENT_MODIFY_ATTRIB |
>> + ((S_ISDIR(inode->i_mode))?FSEVENT_ISDIR:0));
>> }
>>
>> /*
>> @@ -213,6 +232,24 @@ static inline void fsnotify_change(struc
>> inotify_dentry_parent_queue_event(dentry, in_mask, 0,
>> dentry->d_name.name);
>> }
>> +
>> +#ifdef CONFIG_FS_EVENTS
>> + {
>> + u32 fsevent_mask = 0;
>> + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
>> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
>> + if ((ia_valid & ATTR_ATIME) && (ia_valid & ATTR_MTIME))
>> + fsevent_mask |= FSEVENT_MODIFY_ATTRIB;
>> + else if (ia_valid & ATTR_ATIME)
>> + fsevent_mask |= FSEVENT_ACCESS;
>> + else if (ia_valid & ATTR_MTIME)
>> + fsevent_mask |= FSEVENT_MODIFY;
>> + if (ia_valid & ATTR_SIZE)
>> + fsevent_mask |= FSEVENT_MODIFY;
>> + if (fsevent_mask)
>> + raise_fsevent(dentry, fsevent_mask);
>> + }
>> +#endif /* CONFIG_FS_EVENTS */
>> }
>>
>> #ifdef CONFIG_INOTIFY /* inotify helpers */
>> --- a/drivers/connector/Makefile.orig 2006-03-12 21:23:18.000000000 +0800
>> +++ b/drivers/connector/Makefile 2006-03-12 21:24:25.000000000 +0800
>> @@ -1,4 +1,5 @@
>> obj-$(CONFIG_CONNECTOR) += cn.o
>> obj-$(CONFIG_PROC_EVENTS) += cn_proc.o
>> +obj-$(CONFIG_FS_EVENTS) += cn_fs.o
>>
>> cn-y += cn_queue.o connector.o
>> --- a/drivers/connector/Kconfig.orig 2006-03-12 21:23:53.000000000 +0800
>> +++ b/drivers/connector/Kconfig 2006-03-12 21:21:44.000000000 +0800
>> @@ -18,4 +18,12 @@ config PROC_EVENTS
>> Provide a connector that reports process events to userspace. Send
>> events such as fork, exec, id change (uid, gid, suid, etc), and exit.
>>
>> +config FS_EVENTS
>> + boolean "Report filesystem events to userspace"
>> + depends on CONNECTOR=y
>> + default y
>> + ---help---
>> + Provide a connector that reports filesystem events to userspace. The
>> + reported event include open, read, write.
>> +
>> endmenu
>> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
>> +++ b/include/linux/fsevent.h 2006-03-16 22:52:06.000000000 +0800
>> @@ -0,0 +1,86 @@
>> +/*
>> + * fsevent.h - filesystem events connector
>> + *
>> + * Copyright (C) 2006 Yi Yang <[email protected]>
>> + * Based on cn_proc.h by Matt Helsley, IBM Corp
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> + */
>> +
>> +#ifndef LINUX_FSEVENT_H
>> +#define LINUX_FSEVENT_H
>> +
>> +#include <linux/types.h>
>> +#include <linux/time.h>
>> +#include <linux/connector.h>
>> +
>> +enum fsevent_type {
>> + FSEVENT_ACCESS = 0x00000001, /* File was accessed */
>> + FSEVENT_MODIFY = 0x00000002, /* File was modified */
>> + FSEVENT_MODIFY_ATTRIB = 0x00000004, /* Metadata changed */
>> + FSEVENT_CLOSE = 0x00000008, /* File was closed */
>> + FSEVENT_OPEN = 0x00000010, /* File was opened */
>> + FSEVENT_MOVE = 0x00000020, /* File was moved */
>> + FSEVENT_CREATE = 0x00000040, /* File was created */
>> + FSEVENT_DELETE = 0x00000080, /* File was deleted */
>> + FSEVENT_CMDACK = 0x40000000, /* For used internally */
>>
>
> s/used/use/
>
>
>> + FSEVENT_ISDIR = 0x80000000 /* It is set for a dir */
>> +};
>>
>
> So we'd have DN_FOO, IN_FOO, and FSEVENT_FOO? Any chance inotify
> definitions could be used instead of redefining all of these? Perhaps
> that would simplify the code in fsnotify.h.
>
> The only problem I can see is the FSEVENT_CMDACK type. This could be
> avoided by breaking up the type field into an inotify-compatible mask
> field and the type field (EVENT or CMDACK).
>
After I considered them carefully, I think it is good to use new macro
definition, because they don't
correspond to fsnotify or dnotify one to one.
>
>> +/*
>> + * Userspace sends this enum to register with the kernel that it is listening
>> + * for events on the connector.
>> + */
>> +enum fsevent_mode {
>> + FSEVENT_LISTEN = 1,
>> + FSEVENT_IGNORE = 2
>> +};
>> +
>>
>
> Process Events Connector uses this mechanism to avoid most of the event
> generation code if there are no listeners.
>
> Michael Kerrisk has privately suggested to me that this mechanism gives
> userspace too much rope with which to hang itself. I think it just gives
> userspace more rope.
>
> That said, perhaps we can shorten the rope by adding a connector
> function to quickly return a value indicating if a process in userspace
> is listening to messages sent by the kernel. Then connectors could use
> that function rather that reinvent the same mechanism.
>
>
>> +struct fsevent {
>> + __u32 type;
>> + __u32 cpu;
>> + struct timespec timestamp;
>> + __u32 tgid;
>>
>
> I think pid_t would be more appropriate here instead of a __u32.
> Also a tid field of type pid_t would be consistent with the Process
> Events returned to userspace. If a userspace program ever wanted to
> relate the two sets of events the tid field could be important. That
> said it would be nice to know if any userspace programs are planning on
> using this.
>

I think size of pid_t must be changed from 2.4 to 2.6, in Redhat 9.0, I
find size of pid_t in user space is
different from kernel space, I believe that is the problem of stale
header files.
>
>> + __u32 uid;
>> + __u32 gid;
>> + __u32 err;
>>
>
> The err field appears to be unused when sending events back to
> userspace. I suggest reusing it for the event mask and using the type
> field to indicate whether the message is a CMDACK or simply an EVENT.
>
err is useful when you use unknown CMD for the connector, ACK will tell
the user CMD is invalid.
>
>> + __u32 len;
>> + __u32 pname_len;
>> + __u32 fname_len;
>> + __u32 new_fname_len;
>>
>
> I think one of these _len fields could be dropped and later calculated
> as:
>
> new_fname_len = len - (pname_len + fname_len);
>
>
>> + char name[0];
>> +};
>> +
>> +#ifdef __KERNEL__
>> +#ifdef CONFIG_FS_EVENTS
>> +extern void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +extern void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask);
>> +extern void raise_fsevent_create(struct inode * inode,
>> + const char * name, u32 mask);
>> +#else
>> +static void raise_fsevent(struct dentry * dentryp, u32 mask);
>> +{}
>> +
>> +static void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask)
>> +{}
>> +
>> +static void raise_fsevent_create(struct inode * inode,
>> + const char * name, u32 mask)
>> +{}
>> +#endif /* CONFIG_FS_EVENTS */
>> +#endif /* __KERNEL__ */
>> +#endif /* LINUX_FSEVENT_H */
>> --- /dev/null 2003-01-30 18:24:37.000000000 +0800
>> +++ b/drivers/connector/cn_fs.c 2006-03-16 22:54:28.000000000 +0800
>> @@ -0,0 +1,200 @@
>> +/*
>> + * cn_fs.c - filesystem events connector
>> + *
>> + * Copyright (C) 2006 Yi Yang <[email protected]>
>> + * Based on cn_proc.c by Matt Helsley, IBM Corp
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/ktime.h>
>> +#include <linux/init.h>
>> +#include <linux/fs.h>
>> +#include <linux/fsevent.h>
>> +#include <asm/atomic.h>
>> +
>> +#define CN_FS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct fsevent))
>> +
>> +static atomic_t cn_fs_event_listeners = ATOMIC_INIT(0);
>> +static struct cb_id cn_fs_event_id = { CN_IDX_FS, CN_VAL_FS };
>> +
>> +static DEFINE_PER_CPU(__u32, cn_fs_event_counts) = { 0 };
>> +static int fsevent_burst_limit = 100;
>> +static int fsevent_ratelimit = 5 * HZ;
>> +
>> +static inline void get_seq(__u32 *ts, int *cpu)
>> +{
>> + *ts = get_cpu_var(cn_fs_event_counts)++;
>> + *cpu = smp_processor_id();
>> + put_cpu_var(cn_fs_event_counts);
>> +}
>> +
>> +int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
>>
>
> The return value of this function does not appear to be used.
>
>
>> +{
>> + struct cn_msg *msg;
>> + struct fsevent *event;
>> + __u8 * buffer;
>>
>
> While the contents of buffer are being sent to userspace eventually, the
> pointer is not, so I don't think __u8 is necessary or conventional.
>
>
>> + int namelen = 0;
>> + static unsigned long last = 0;
>> + static int fsevent_sum = 0;
>>
>
> Yuck, static local variables. IMHO these should be globals. It would
> make the fact they aren't protected from concurrent access more obvious
> (see below).
>
>
>> + if (atomic_read(&cn_fs_event_listeners) < 1)
>> + return 0;
>> +
>> + if (jiffies - last <= fsevent_ratelimit) {
>> + if (fsevent_sum > fsevent_burst_limit) {
>> + return -1;
>>
>
> OK, so you're rate limiting the events. Shouldn't you still boost the
> sequence number so that userspace knows some events got dropped? Also
> perhaps you can find an appropriate error to return instead of -1.
>
>
>> + }
>>
>
> remove unecessary braces
>
>
>> + fsevent_sum++;
>>
>
> Looks racy to me -- what's protecting fsevent_sum from access by
> multiple cpus?
>
>
>> + }
>> + else {
>>
>
> I seem to recall that coding style would have you move the else up.
>
>
>> + last = jiffies;
>> + fsevent_sum = 0;
>> + }
>> +
>> + namelen = strlen(current->comm) + strlen(oldname) + 2;
>> + if (newname) {
>> + namelen += strlen(newname) + 1;
>> + }
>>
>
> remove braces
>
>
>> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE + namelen, GFP_KERNEL))
>> + == NULL) {
>>
>
> Pull the assignment out of the condition. You're not saving any space by
> putting it into the if () and it's harder to read. I don't think the
> __u8 cast is necessary..
>
>
>> + printk("cn_fs: out of memory\n");
>>
>
> missing printk tag
>
>
>> + return -1;
>>
>
> Perhaps:
> return -ENOMEM;
>
>
>> + }
>> +
>> + msg = (struct cn_msg*)buffer;
>> + event = (struct fsevent*)msg->data;
>> + get_seq(&msg->seq, &event->cpu);
>> + ktime_get_ts(&event->timestamp);
>> + event->type = mask;
>> + event->tgid = current->tgid;
>> + event->uid = current->uid;
>> + event->gid = current->gid;
>> + event->pname_len = strlen(current->comm);
>> + memcpy(event->name, current->comm, event->pname_len);
>> + event->name[event->pname_len] = '\0';
>> + event->fname_len = strlen(oldname);
>> + memcpy(event->name + event->pname_len + 1, oldname, strlen(oldname));
>> + event->len = event->pname_len + event->fname_len + 2;
>> + event->name[event->len-1] = '\0';
>> + event->new_fname_len = 0;
>> + if (newname) {
>> + event->new_fname_len = strlen(newname);
>> + memcpy(event->name + event->len, newname, strlen(newname));
>> + event->len += event->new_fname_len + 1;
>> + event->name[event->len-1] = '\0';
>> + }
>>
>
>
> Maybe it's just my personal taste, but you could use pointer arithmetic
> while copying the strings into the event buffer:
>
> name = event->name;
>
> strncpy(name, current->comm, event->pname_len);
> name += event->pname_len;
> name[0] = '\0';
> name++;
>
> strncpy(name, oldname, event->fname_len);
> name += event->fname_len;
> name[0] = '\0';
> name++;
> ...
>
> Then you could break it out into a small function that appends the
> string to the buffer for you and call it for each string:
>
> static void append_string(char **dest, const char *src, size_t len)
> {
> strncpy(*dest, src, len);
> *dest += len;
> (*dest)[0] = '\0';
> (*dest)++;
> }
>
> ...
> int __raise_fsevent(const char * oldname, const char * newname,
> u32 mask)
> {
> ...
> name = event->name;
> append_string(&name, current->comm, event->pname_len);
> event->fname_len = strlen(oldname);
> append_string(&name, oldname, event->fname_len);
> ...
> }
>
>
>> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
>> + msg->ack = 0;
>> + msg->len = sizeof(struct fsevent) + event->len;
>> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
>> + kfree(buffer);
>> + return 0;
>> +}
>> +
>> +void raise_fsevent(struct dentry * dentryp, u32 mask)
>> +{
>> + __raise_fsevent(dentryp->d_name.name, NULL, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent);
>> +
>> +void raise_fsevent_create(struct inode * inode, const char * name, u32 mask)
>> +{
>> + __raise_fsevent(name, NULL, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent_create);
>> +
>> +void raise_fsevent_move(struct inode * olddir, const char * oldname,
>> + struct inode * newdir, const char * newname, u32 mask)
>> +{
>> + __raise_fsevent(oldname, newname, mask);
>> +}
>> +EXPORT_SYMBOL_GPL(raise_fsevent_move);
>> +
>> +static void cn_fs_event_ack(int err, int rcvd_seq, int rcvd_ack)
>> +{
>> + struct cn_msg *msg;
>> + struct fsevent *event;
>> + __u8 * buffer = NULL;
>> +
>> + if (atomic_read(&cn_fs_event_listeners) < 1)
>> + return;
>> +
>> + if ((buffer = (__u8 *)kmalloc(CN_FS_MSG_SIZE, GFP_KERNEL)) == NULL) {
>>
>
> I don't think the __u8 cast is necessary
>
>
>> + printk("cn_fs: out of memory\n");
>>
>
> missing printk tag
>
>
>> + return;
>> + }
>> +
>> + msg = (struct cn_msg*)buffer;
>> + event = (struct fsevent*)msg->data;
>> + msg->seq = rcvd_seq;
>> + ktime_get_ts(&event->timestamp);
>> + event->cpu = -1;
>> + event->type = FSEVENT_CMDACK;
>> + event->tgid = 0;
>> + event->uid = 0;
>> + event->gid = 0;
>> + event->len = 0;
>> + event->pname_len = 0;
>> + event->fname_len = 0;
>> + event->new_fname_len = 0;
>> + event->err = err;
>> + memcpy(&msg->id, &cn_fs_event_id, sizeof(msg->id));
>> + msg->ack = rcvd_ack + 1;
>> + msg->len = sizeof(struct fsevent);
>> + cn_netlink_send(msg, CN_IDX_FS, GFP_KERNEL);
>> +}
>> +
>> +static void cn_fsevent_mode_ctl(void *data)
>> +{
>> + struct cn_msg *msg = data;
>> + enum fsevent_mode *mode = NULL;
>> + int err = 0;
>> +
>> + if (msg->len != sizeof(*mode))
>> + return;
>> +
>> + mode = (enum fsevent_mode*)msg->data;
>> + switch (*mode) {
>> + case FSEVENT_LISTEN:
>> + atomic_inc(&cn_fs_event_listeners);
>> + break;
>> + case FSEVENT_IGNORE:
>> + atomic_dec(&cn_fs_event_listeners);
>> + break;
>> + default:
>> + err = EINVAL;
>> + break;
>> + }
>> + cn_fs_event_ack(err, msg->seq, msg->ack);
>> +}
>> +
>> +static int __init cn_fs_init(void)
>> +{
>> + int err;
>> +
>> + if ((err = cn_add_callback(&cn_fs_event_id, "cn_fs",
>> + &cn_fsevent_mode_ctl))) {
>> + printk(KERN_WARNING "cn_fs failed to register\n");
>> + return err;
>> + }
>> + return 0;
>> +}
>> +
>> +module_init(cn_fs_init);
>>
>
>
>
>

2006-03-24 06:46:16

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Fri, 2006-03-24 at 08:45 +0800, Yi Yang wrote:
> Arjan van de Ven wrote:
> > On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
> >
> >> This patch implements a new connector, Filesystem Event Connector,
> >> the user can monitor filesystem activities via it, currently, it
> >> can monitor access, attribute change, open, create, modify, delete,
> >> move and close of any file or directory.
> >>
> >
> >
> > how is this not redundant functionality with the audit subsystem ?
> >
> If you open Audit option, audit subsystem will audit all the syscalls,

no only those you subscribe to

and your other arguments... I don't buy them either.
If you want a small improvement to audit, DO THAT, not duplicate in
other code...


2006-03-24 07:47:36

by Matt Helsley

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Fri, 2006-03-24 at 09:21 +0800, Yi Yang wrote:
> Matt Helsley wrote:
>
> Thanks for Matt's careful review. I'll follow your advices to modify it
> and new version will be released soon.
> > On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:
> >

<snip>

> >> +int __raise_fsevent(const char * oldname, const char * newname, u32 mask)
> >>
> >
> > The return value of this function does not appear to be used.
> >
> If some modules want to use it to transfer file system events reliably,
> the return value will be very valuble,
> because the caller can retry the transfer until it successes.

Fair enough, though I hope you'll return -EFOO rather than -1, -2,...

<snip>

> >> + int namelen = 0;
> >> + static unsigned long last = 0;
> >> + static int fsevent_sum = 0;
> >>
> >
> > Yuck, static local variables. IMHO these should be globals. It would
> > make the fact they aren't protected from concurrent access more obvious
> > (see below).
> >
> Yes, they should be global.
> >
> >> + if (atomic_read(&cn_fs_event_listeners) < 1)
> >> + return 0;
> >> +
> >> + if (jiffies - last <= fsevent_ratelimit) {
> >> + if (fsevent_sum > fsevent_burst_limit) {
> >> + return -1;
> >>
> >
> > OK, so you're rate limiting the events. Shouldn't you still boost the
> > sequence number so that userspace knows some events got dropped? Also
> > perhaps you can find an appropriate error to return instead of -1.
> >
> Good idea.
> >
> >> + }
> >>
> >
> > remove unecessary braces
> >
> >
> >> + fsevent_sum++;
> >>
> >
> > Looks racy to me -- what's protecting fsevent_sum from access by
> > multiple cpus?
> >
> This just is used to limit event rate when the user space application
> leads to an unlimited events loop.
> so it mustn't be precise, I used spinlock originally, but Andrew thinks
> lock overhead is big, inotify has led to
> some frustrating lock overhead, so I decide to remove it, in fact
> fsevent_sum just is the number used to limit rate,
> some race conditions don't effect the rate limit.

OK, I can see why you would want to avoid a spinlock. However spinlocks
are not your only option. For instance you could use the per-cpu idioms
to limit the rate.

I would argue preemption should be disabled around the if-block at the
very least. Suppose your rate limit is 10k calls/sec and you have 4
procs. Each proc has a sequence of three instructions:

load fsevent_sum into register rx (rx <= 1000)
rx++ (rx <= 1001)
store contents of register rx in fsevent_sum (fsevent_sum <= 1001)


Now consider the following sequence of steps:

load fsevent_sum into rx (rx <= 1000)
<preempted>
<3 other processors each manage to increment the sum by 3333 bringing us
to 9999>
<resumed>
rx++ (rx <= 1001)
store contents of rx in fsevent_sum (fsevent_sum <= 1001)

So every processor now thinks it won't exceed the rate limit by
generating more events when in fact we've just exceeded the limit. So,
unless my example is flawed, I think you need to disable preemption
here.

Also, even if you simply disable preemption couldn't this cause the
cache line containing the sum to bounce frequently on large SMP systems?

<snip>

Cheers,
-Matt Helsley

2006-03-24 08:12:20

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Thu, Mar 23, 2006 at 11:35:50PM -0800, Matt Helsley ([email protected]) wrote:
> I would argue preemption should be disabled around the if-block at the
> very least. Suppose your rate limit is 10k calls/sec and you have 4
> procs. Each proc has a sequence of three instructions:
>
> load fsevent_sum into register rx (rx <= 1000)
> rx++ (rx <= 1001)
> store contents of register rx in fsevent_sum (fsevent_sum <= 1001)
>
>
> Now consider the following sequence of steps:
>
> load fsevent_sum into rx (rx <= 1000)
> <preempted>
> <3 other processors each manage to increment the sum by 3333 bringing us
> to 9999>
> <resumed>
> rx++ (rx <= 1001)
> store contents of rx in fsevent_sum (fsevent_sum <= 1001)
>
> So every processor now thinks it won't exceed the rate limit by
> generating more events when in fact we've just exceeded the limit. So,
> unless my example is flawed, I think you need to disable preemption
> here.

Doesn't it just exceed the limit by one event per cpu?

> Also, even if you simply disable preemption couldn't this cause the
> cache line containing the sum to bounce frequently on large SMP systems?
>
> <snip>
>
> Cheers,
> -Matt Helsley

--
Evgeniy Polyakov

2006-03-24 09:55:30

by Matt Helsley

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Fri, 2006-03-24 at 11:11 +0300, Evgeniy Polyakov wrote:
> On Thu, Mar 23, 2006 at 11:35:50PM -0800, Matt Helsley ([email protected]) wrote:
> > I would argue preemption should be disabled around the if-block at the
> > very least. Suppose your rate limit is 10k calls/sec and you have 4
> > procs. Each proc has a sequence of three instructions:
> >
> > load fsevent_sum into register rx (rx <= 1000)
> > rx++ (rx <= 1001)
> > store contents of register rx in fsevent_sum (fsevent_sum <= 1001)
> >
> >
> > Now consider the following sequence of steps:
> >
> > load fsevent_sum into rx (rx <= 1000)
> > <preempted>
> > <3 other processors each manage to increment the sum by 3333 bringing us
> > to 9999>
> > <resumed>
> > rx++ (rx <= 1001)
> > store contents of rx in fsevent_sum (fsevent_sum <= 1001)
> >
> > So every processor now thinks it won't exceed the rate limit by
> > generating more events when in fact we've just exceeded the limit. So,
> > unless my example is flawed, I think you need to disable preemption
> > here.
>
> Doesn't it just exceed the limit by one event per cpu?

The example exceeds it by one at the time of the final store. Thanks to
the fact that the value is then 1001 it may shortly be exceeded by much
more than 1.

Cheers,
-Matt Helsley

2006-03-24 10:07:08

by Matt Helsley

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Fri, 2006-03-24 at 10:50 +0800, Yi Yang wrote:
> Matt Helsley wrote:
> > On Wed, 2006-03-22 at 22:58 +0800, Yi Yang wrote:

<snip>

> >> +struct fsevent {
> >> + __u32 type;
> >> + __u32 cpu;
> >> + struct timespec timestamp;
> >> + __u32 tgid;
> >>
> >
> > I think pid_t would be more appropriate here instead of a __u32.
> > Also a tid field of type pid_t would be consistent with the Process
> > Events returned to userspace. If a userspace program ever wanted to
> > relate the two sets of events the tid field could be important. That
> > said it would be nice to know if any userspace programs are planning on
> > using this.
> >
>
> I think size of pid_t must be changed from 2.4 to 2.6, in Redhat 9.0, I
> find size of pid_t in user space is
> different from kernel space, I believe that is the problem of stale
> header files.

I'm not sure why a connector would have to be concerned with the size of
pid_t in linux-2.4.x unless it's backported since, as far as I'm aware,
connectors have not been backported.

> >
> >> + __u32 uid;
> >> + __u32 gid;
> >> + __u32 err;
> >>
> >
> > The err field appears to be unused when sending events back to
> > userspace. I suggest reusing it for the event mask and using the type
> > field to indicate whether the message is a CMDACK or simply an EVENT.
> >
> err is useful when you use unknown CMD for the connector, ACK will tell
> the user CMD is invalid.

Yes, but the point is for an event -- which presumably is *much* more
frequent that a CMD -- the err field would be unused. Hence my
suggestion of having it serve as the mask field for event messages
instead of combining the event mask into the type field.

<snip>

Cheers,
-Matt Helsley

2006-03-24 10:10:14

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [2.6.16 PATCH] Connector: Filesystem Events Connector

On Fri, Mar 24, 2006 at 01:42:17AM -0800, Matt Helsley ([email protected]) wrote:
> On Fri, 2006-03-24 at 11:11 +0300, Evgeniy Polyakov wrote:
> > On Thu, Mar 23, 2006 at 11:35:50PM -0800, Matt Helsley ([email protected]) wrote:
> > > I would argue preemption should be disabled around the if-block at the
> > > very least. Suppose your rate limit is 10k calls/sec and you have 4
> > > procs. Each proc has a sequence of three instructions:
> > >
> > > load fsevent_sum into register rx (rx <= 1000)
> > > rx++ (rx <= 1001)
> > > store contents of register rx in fsevent_sum (fsevent_sum <= 1001)
> > >
> > >
> > > Now consider the following sequence of steps:
> > >
> > > load fsevent_sum into rx (rx <= 1000)
> > > <preempted>
> > > <3 other processors each manage to increment the sum by 3333 bringing us
> > > to 9999>
> > > <resumed>
> > > rx++ (rx <= 1001)
> > > store contents of rx in fsevent_sum (fsevent_sum <= 1001)
> > >
> > > So every processor now thinks it won't exceed the rate limit by
> > > generating more events when in fact we've just exceeded the limit. So,
> > > unless my example is flawed, I think you need to disable preemption
> > > here.
> >
> > Doesn't it just exceed the limit by one event per cpu?
>
> The example exceeds it by one at the time of the final store. Thanks to
> the fact that the value is then 1001 it may shortly be exceeded by much
> more than 1.

+
+ if (jiffies - last <= fsevent_ratelimit) {
+ if (fsevent_sum > fsevent_burst_limit)
+ return -2;
+ fsevent_sum++;

Only process (and not process' syscall) can preempt us here,
so fsevent_sum can only exceed fsevent_burst_limit by one per process
(process can not preempt itself, so when it has finished syscall which
ends up in event generation, fsevent_sum will be increased).

+ } else {
+ last = jiffies;
+ fsevent_sum = 0;
+ }

Actually, since jiffies and atomic operations are already used, I do not
think addition of new atomic_inc_return or something similar will
even somehow change the picture.


> Cheers,
> -Matt Helsley

--
Evgeniy Polyakov

2006-03-24 14:04:15

by Yi Yang

[permalink] [raw]
Subject: Re: Connector: Filesystem Events Connector

On 3/24/06, Evgeniy Polyakov <[email protected]> wrote:
> On Fri, Mar 24, 2006 at 01:42:17AM -0800, Matt Helsley ([email protected])
> wrote:
> > On Fri, 2006-03-24 at 11:11 +0300, Evgeniy Polyakov wrote:
> > > On Thu, Mar 23, 2006 at 11:35:50PM -0800, Matt Helsley
> ([email protected]) wrote:
> > > > I would argue preemption should be disabled around the if-block at the
> > > > very least. Suppose your rate limit is 10k calls/sec and you have 4
> > > > procs. Each proc has a sequence of three instructions:
> > > >
> > > > load fsevent_sum into register rx (rx <= 1000)
> > > > rx++ (rx <= 1001)
> > > > store contents of register rx in fsevent_sum (fsevent_sum <= 1001)
> > > >
> > > >
> > > > Now consider the following sequence of steps:
> > > >
> > > > load fsevent_sum into rx (rx <= 1000)
> > > > <preempted>
> > > > <3 other processors each manage to increment the sum by 3333 bringing
> us
> > > > to 9999>
> > > > <resumed>
> > > > rx++ (rx <= 1001)
> > > > store contents of rx in fsevent_sum (fsevent_sum <= 1001)
> > > >
> > > > So every processor now thinks it won't exceed the rate limit by
> > > > generating more events when in fact we've just exceeded the limit. So,
> > > > unless my example is flawed, I think you need to disable preemption
> > > > here.
> > >
> > > Doesn't it just exceed the limit by one event per cpu?
> >
> > The example exceeds it by one at the time of the final store. Thanks to
> > the fact that the value is then 1001 it may shortly be exceeded by much
> > more than 1.
>
> +
> + if (jiffies - last <= fsevent_ratelimit) {
> + if (fsevent_sum > fsevent_burst_limit)
> + return -2;
> + fsevent_sum++;
>
> Only process (and not process' syscall) can preempt us here,
> so fsevent_sum can only exceed fsevent_burst_limit by one per process
> (process can not preempt itself, so when it has finished syscall which
> ends up in event generation, fsevent_sum will be increased).
>
> + } else {
> + last = jiffies;
> + fsevent_sum = 0;
> + }
>
> Actually, since jiffies and atomic operations are already used, I do not
> think addition of new atomic_inc_return or something similar will
> even somehow change the picture.
rate limit is just for some abnormal cases, for example, an
application leads to a unlimited event loop, it should be very few, in
the worst case, the user application which induced this case will
terminate, this should be a reasonable result.
rate limit is intent to prevent this kind of case and avoid DoS of system.
>
>
> > Cheers,
> > -Matt Helsley
>
> --
> Evgeniy Polyakov
>