Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751008AbWIFNoo (ORCPT ); Wed, 6 Sep 2006 09:44:44 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751003AbWIFNoo (ORCPT ); Wed, 6 Sep 2006 09:44:44 -0400 Received: from relay03.pair.com ([209.68.5.17]:24850 "HELO relay03.pair.com") by vger.kernel.org with SMTP id S1751008AbWIFNnH (ORCPT ); Wed, 6 Sep 2006 09:43:07 -0400 X-pair-Authenticated: 71.197.50.189 Date: Wed, 6 Sep 2006 08:40:21 -0500 (CDT) From: Chase Venters X-X-Sender: root@turbotaz.ourhouse To: Evgeniy Polyakov cc: lkml , David Miller , Ulrich Drepper , Andrew Morton , netdev , Zach Brown , Christoph Hellwig , Chase Venters , Johann Borck Subject: Re: [take16 1/4] kevent: Core files. In-Reply-To: <1157543723488@2ka.mipt.ru> Message-ID: References: <1157543723488@2ka.mipt.ru> MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII; format=flowed Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 53256 Lines: 1862 Evgeniy, Sorry about the radio silence later. Some reviewer commentary follows. On Wed, 6 Sep 2006, Evgeniy Polyakov wrote: > > Core files. > > This patch includes core kevent files: > - userspace controlling > - kernelspace interfaces > - initialization > - notification state machines > > Signed-off-by: Evgeniy Polyakov > > diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S > index dd63d47..c10698e 100644 > --- a/arch/i386/kernel/syscall_table.S > +++ b/arch/i386/kernel/syscall_table.S > @@ -317,3 +317,6 @@ ENTRY(sys_call_table) > .long sys_tee /* 315 */ > .long sys_vmsplice > .long sys_move_pages > + .long sys_kevent_get_events > + .long sys_kevent_ctl > + .long sys_kevent_wait /* 320 */ > diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S > index 5d4a7d1..a06b76f 100644 > --- a/arch/x86_64/ia32/ia32entry.S > +++ b/arch/x86_64/ia32/ia32entry.S > @@ -710,7 +710,10 @@ #endif > .quad compat_sys_get_robust_list > .quad sys_splice > .quad sys_sync_file_range > - .quad sys_tee > + .quad sys_tee /* 315 */ > .quad compat_sys_vmsplice > .quad compat_sys_move_pages > + .quad sys_kevent_get_events > + .quad sys_kevent_ctl > + .quad sys_kevent_wait /* 320 */ > ia32_syscall_end: > diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h > index fc1c8dd..68072b5 100644 > --- a/include/asm-i386/unistd.h > +++ b/include/asm-i386/unistd.h > @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 > #define __NR_tee 315 > #define __NR_vmsplice 316 > #define __NR_move_pages 317 > +#define __NR_kevent_get_events 318 > +#define __NR_kevent_ctl 319 > +#define __NR_kevent_wait 320 > > #ifdef __KERNEL__ > > -#define NR_syscalls 318 > +#define NR_syscalls 321 > > /* > * user-visible error numbers are in the range -1 - -128: see > diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h > index 94387c9..ee907ad 100644 > --- a/include/asm-x86_64/unistd.h > +++ b/include/asm-x86_64/unistd.h > @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 > __SYSCALL(__NR_vmsplice, sys_vmsplice) > #define __NR_move_pages 279 > __SYSCALL(__NR_move_pages, sys_move_pages) > +#define __NR_kevent_get_events 280 > +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) > +#define __NR_kevent_ctl 281 > +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) > +#define __NR_kevent_wait 282 > +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) > > #ifdef __KERNEL__ > > -#define __NR_syscall_max __NR_move_pages > +#define __NR_syscall_max 
__NR_kevent_wait > > #ifndef __NO_STUBS > > diff --git a/include/linux/kevent.h b/include/linux/kevent.h > new file mode 100644 > index 0000000..67007f2 > --- /dev/null > +++ b/include/linux/kevent.h > @@ -0,0 +1,196 @@ > +/* > + * 2006 Copyright (c) Evgeniy Polyakov > + * All rights reserved. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > + */ > + > +#ifndef __KEVENT_H > +#define __KEVENT_H > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#define KEVENT_MIN_BUFFS_ALLOC 3 > + > +struct kevent; > +struct kevent_storage; > +typedef int (* kevent_callback_t)(struct kevent *); > + > +/* @callback is called each time new event has been caught. */ > +/* @enqueue is called each time new event is queued. */ > +/* @dequeue is called each time event is dequeued. */ > + > +struct kevent_callbacks { > + kevent_callback_t callback, enqueue, dequeue; > +}; > + > +#define KEVENT_READY 0x1 > +#define KEVENT_STORAGE 0x2 > +#define KEVENT_USER 0x4 > + > +struct kevent > +{ > + /* Used for kevent freeing.*/ > + struct rcu_head rcu_head; > + struct ukevent event; > + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ > + spinlock_t ulock; > + > + /* Entry of user's queue. */ > + struct list_head kevent_entry; > + /* Entry of origin's queue. */ > + struct list_head storage_entry; > + /* Entry of user's ready. */ > + struct list_head ready_entry; > + > + u32 flags; > + > + /* User who requested this kevent. */ > + struct kevent_user *user; > + /* Kevent container. */ > + struct kevent_storage *st; > + > + struct kevent_callbacks callbacks; > + > + /* Private data for different storages. > + * poll()/select storage has a list of wait_queue_t containers > + * for each ->poll() { poll_wait()' } here. > + */ > + void *priv; > +}; > + > +#define KEVENT_HASH_MASK 0xff > + > +struct kevent_user > +{ These structure names get a little dicey (kevent, kevent_user, ukevent, mukevent)... might there be slightly different names that could be selected to better distinguish the purpose of each? > + struct list_head kevent_list[KEVENT_HASH_MASK+1]; > + spinlock_t kevent_lock; > + /* Number of queued kevents. */ > + unsigned int kevent_num; > + > + /* List of ready kevents. */ > + struct list_head ready_list; > + /* Number of ready kevents. */ > + unsigned int ready_num; > + /* Protects all manipulations with ready queue. */ > + spinlock_t ready_lock; > + > + /* Protects against simultaneous kevent_user control manipulations. */ > + struct mutex ctl_mutex; > + /* Wait until some events are ready. */ > + wait_queue_head_t wait; > + > + /* Reference counter, increased for each new kevent. 
*/ > + atomic_t refcnt; > + > + unsigned int pages_in_use; > + /* Array of pages forming mapped ring buffer */ > + struct kevent_mring **pring; > + > +#ifdef CONFIG_KEVENT_USER_STAT > + unsigned long im_num; > + unsigned long wait_num; > + unsigned long total; > +#endif > +}; > + > +int kevent_enqueue(struct kevent *k); > +int kevent_dequeue(struct kevent *k); > +int kevent_init(struct kevent *k); > +void kevent_requeue(struct kevent *k); > +int kevent_break(struct kevent *k); > + > +int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos); > + > +void kevent_user_ring_add_event(struct kevent *k); > + > +void kevent_storage_ready(struct kevent_storage *st, > + kevent_callback_t ready_callback, u32 event); > +int kevent_storage_init(void *origin, struct kevent_storage *st); > +void kevent_storage_fini(struct kevent_storage *st); > +int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k); > +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k); > + > +int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u); > + > +#ifdef CONFIG_KEVENT_POLL > +void kevent_poll_reinit(struct file *file); > +#else > +static inline void kevent_poll_reinit(struct file *file) > +{ > +} > +#endif > + > +#ifdef CONFIG_KEVENT_USER_STAT > +static inline void kevent_stat_init(struct kevent_user *u) > +{ > + u->wait_num = u->im_num = u->total = 0; > +} > +static inline void kevent_stat_print(struct kevent_user *u) > +{ > + pr_debug("%s: u=%p, wait=%lu, immediately=%lu, total=%lu.\n", > + __func__, u, u->wait_num, u->im_num, u->total); > +} > +static inline void kevent_stat_im(struct kevent_user *u) > +{ > + u->im_num++; > +} > +static inline void kevent_stat_wait(struct kevent_user *u) > +{ > + u->wait_num++; > +} > +static inline void kevent_stat_total(struct kevent_user *u) > +{ > + u->total++; > +} > +#else > +#define kevent_stat_print(u) ({ (void) u;}) > +#define kevent_stat_init(u) ({ (void) u;}) > +#define kevent_stat_im(u) ({ (void) u;}) > +#define kevent_stat_wait(u) ({ (void) u;}) > +#define kevent_stat_total(u) ({ (void) u;}) > +#endif > + > +#ifdef CONFIG_KEVENT_SOCKET > +#ifdef CONFIG_LOCKDEP > +void kevent_socket_reinit(struct socket *sock); > +void kevent_sk_reinit(struct sock *sk); > +#else > +static inline void kevent_socket_reinit(struct socket *sock) > +{ > +} > +static inline void kevent_sk_reinit(struct sock *sk) > +{ > +} > +#endif > +void kevent_socket_notify(struct sock *sock, u32 event); > +int kevent_socket_dequeue(struct kevent *k); > +int kevent_socket_enqueue(struct kevent *k); > +#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC) > +#else > +static inline void kevent_socket_notify(struct sock *sock, u32 event) > +{ > +} > +#define sock_async(__sk) ({ (void)__sk; 0; }) > +#endif > + > +#endif /* __KEVENT_H */ > diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h > new file mode 100644 > index 0000000..a38575d > --- /dev/null > +++ b/include/linux/kevent_storage.h > @@ -0,0 +1,11 @@ > +#ifndef __KEVENT_STORAGE_H > +#define __KEVENT_STORAGE_H > + > +struct kevent_storage > +{ > + void *origin; /* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */ > + struct list_head list; /* List of queued kevents. */ > + spinlock_t lock; /* Protects users queue. 
*/ > +}; > + > +#endif /* __KEVENT_STORAGE_H */ > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 008f04c..9d4690f 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -597,4 +597,8 @@ asmlinkage long sys_get_robust_list(int > asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, > size_t len); > > +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max, > + __u64 timeout, struct ukevent __user *buf, unsigned flags); > +asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf); > +asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int start, unsigned int num, __u64 timeout); > #endif > diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h > new file mode 100644 > index 0000000..f8ff3a2 > --- /dev/null > +++ b/include/linux/ukevent.h > @@ -0,0 +1,155 @@ > +/* > + * 2006 Copyright (c) Evgeniy Polyakov > + * All rights reserved. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > + */ > + > +#ifndef __UKEVENT_H > +#define __UKEVENT_H > + > +/* > + * Kevent request flags. > + */ > + > +/* Process this event only once and then dequeue. */ > +#define KEVENT_REQ_ONESHOT 0x1 > + > +/* > + * Kevent return flags. > + */ > +/* Kevent is broken. */ > +#define KEVENT_RET_BROKEN 0x1 > +/* Kevent processing was finished successfully. */ > +#define KEVENT_RET_DONE 0x2 > + > +/* > + * Kevent type set. > + */ > +#define KEVENT_SOCKET 0 > +#define KEVENT_INODE 1 > +#define KEVENT_TIMER 2 > +#define KEVENT_POLL 3 > +#define KEVENT_NAIO 4 > +#define KEVENT_AIO 5 > +#define KEVENT_MAX 6 > + > +/* > + * Per-type event sets. > + * Number of per-event sets should be exactly as number of kevent types. > + */ > + > +/* > + * Timer events. > + */ > +#define KEVENT_TIMER_FIRED 0x1 > + > +/* > + * Socket/network asynchronous IO events. > + */ > +#define KEVENT_SOCKET_RECV 0x1 > +#define KEVENT_SOCKET_ACCEPT 0x2 > +#define KEVENT_SOCKET_SEND 0x4 > + > +/* > + * Inode events. > + */ > +#define KEVENT_INODE_CREATE 0x1 > +#define KEVENT_INODE_REMOVE 0x2 > + > +/* > + * Poll events. > + */ > +#define KEVENT_POLL_POLLIN 0x0001 > +#define KEVENT_POLL_POLLPRI 0x0002 > +#define KEVENT_POLL_POLLOUT 0x0004 > +#define KEVENT_POLL_POLLERR 0x0008 > +#define KEVENT_POLL_POLLHUP 0x0010 > +#define KEVENT_POLL_POLLNVAL 0x0020 > + > +#define KEVENT_POLL_POLLRDNORM 0x0040 > +#define KEVENT_POLL_POLLRDBAND 0x0080 > +#define KEVENT_POLL_POLLWRNORM 0x0100 > +#define KEVENT_POLL_POLLWRBAND 0x0200 > +#define KEVENT_POLL_POLLMSG 0x0400 > +#define KEVENT_POLL_POLLREMOVE 0x1000 > + > +/* > + * Asynchronous IO events. > + */ > +#define KEVENT_AIO_BIO 0x1 > + > +#define KEVENT_MASK_ALL 0xffffffff > +/* Mask of all possible event values. 
*/ > +#define KEVENT_MASK_EMPTY 0x0 > +/* Empty mask of ready events. */ > + > +struct kevent_id > +{ > + union { > + __u32 raw[2]; > + __u64 raw_u64 __attribute__((aligned(8))); > + }; > +}; > + > +struct ukevent > +{ > + /* Id of this request, e.g. socket number, file descriptor and so on... */ > + struct kevent_id id; > + /* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */ > + __u32 type; > + /* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */ > + __u32 event; > + /* Per-event request flags */ > + __u32 req_flags; > + /* Per-event return flags */ > + __u32 ret_flags; > + /* Event return data. Event originator fills it with anything it likes. */ > + __u32 ret_data[2]; > + /* User's data. It is not used, just copied to/from user. > + * The whole structure is aligned to 8 bytes already, so the last union > + * is aligned properly. > + */ > + union { > + __u32 user[2]; > + void *ptr; > + }; > +}; > + > +struct mukevent > +{ > + struct kevent_id id; > + __u32 ret_flags; > +}; > + > +#define KEVENT_MAX_EVENTS 4096 > + This limit governs how many simultaneous kevents you can be waiting on / for at once, correct? Would it be possible to drop the hard limit and limit instead, say, the maximum number of kevents you can have pending in the mmap ring-buffer? After the number is exceeded, additional events could get dropped, or some magic number could be put in the kevent_mring->index field to let the process know that it must hit another syscall to drain the rest of the events. > +/* > + * Note that kevents does not exactly fill the page (each mukevent is 12 bytes), > + * so we reuse 4 bytes at the begining of the first page to store index. > + * Take that into account if you want to change size of struct mukevent. > + */ > +#define KEVENTS_ON_PAGE ((PAGE_SIZE-sizeof(unsigned int))/sizeof(struct mukevent)) > +struct kevent_mring > +{ > + unsigned int index; > + struct mukevent event[KEVENTS_ON_PAGE]; > +}; > + > +#define KEVENT_CTL_ADD 0 > +#define KEVENT_CTL_REMOVE 1 > +#define KEVENT_CTL_MODIFY 2 > + > +#endif /* __UKEVENT_H */ > diff --git a/init/Kconfig b/init/Kconfig > index a099fc6..c550fcc 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -218,6 +218,8 @@ config AUDITSYSCALL > such as SELinux. To use audit's filesystem watch feature, please > ensure that INOTIFY is configured. > > +source "kernel/kevent/Kconfig" > + > config IKCONFIG > bool "Kernel .config support" > ---help--- > diff --git a/kernel/Makefile b/kernel/Makefile > index d62ec66..2d7a6dd 100644 > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl > obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ > obj-$(CONFIG_SECCOMP) += seccomp.o > obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o > +obj-$(CONFIG_KEVENT) += kevent/ > obj-$(CONFIG_RELAY) += relay.o > obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o > obj-$(CONFIG_TASKSTATS) += taskstats.o > diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig > new file mode 100644 > index 0000000..85ad472 > --- /dev/null > +++ b/kernel/kevent/Kconfig > @@ -0,0 +1,40 @@ > +config KEVENT > + bool "Kernel event notification mechanism" > + help > + This option enables event queue mechanism. > + It can be used as replacement for poll()/select(), AIO callback > + invocations, advanced timer notifications and other kernel > + object status changes. 
> + > +config KEVENT_USER_STAT > + bool "Kevent user statistic" > + depends on KEVENT > + default N > + help > + This option will turn kevent_user statistic collection on. > + Statistic data includes total number of kevent, number of kevents > + which are ready immediately at insertion time and number of kevents > + which were removed through readiness completion. > + It will be printed each time control kevent descriptor is closed. > + > +config KEVENT_TIMER > + bool "Kernel event notifications for timers" > + depends on KEVENT > + help > + This option allows to use timers through KEVENT subsystem. > + > +config KEVENT_POLL > + bool "Kernel event notifications for poll()/select()" > + depends on KEVENT > + help > + This option allows to use kevent subsystem for poll()/select() > + notifications. > + > +config KEVENT_SOCKET > + bool "Kernel event notifications for sockets" > + depends on NET && KEVENT > + help > + This option enables notifications through KEVENT subsystem of > + sockets operations, like new packet receiving conditions, > + ready for accept conditions and so on. > + > diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile > new file mode 100644 > index 0000000..9130cad > --- /dev/null > +++ b/kernel/kevent/Makefile > @@ -0,0 +1,4 @@ > +obj-y := kevent.o kevent_user.o > +obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o > +obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o > +obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o > diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c > new file mode 100644 > index 0000000..422f585 > --- /dev/null > +++ b/kernel/kevent/kevent.c > @@ -0,0 +1,227 @@ > +/* > + * 2006 Copyright (c) Evgeniy Polyakov > + * All rights reserved. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +/* > + * Attempts to add an event into appropriate origin's queue. > + * Returns positive value if this event is ready immediately, > + * negative value in case of error and zero if event has been queued. > + * ->enqueue() callback must increase origin's reference counter. > + */ > +int kevent_enqueue(struct kevent *k) > +{ > + return k->callbacks.enqueue(k); > +} > + > +/* > + * Remove event from the appropriate queue. > + * ->dequeue() callback must decrease origin's reference counter. > + */ > +int kevent_dequeue(struct kevent *k) > +{ > + return k->callbacks.dequeue(k); > +} > + > +/* > + * Mark kevent as broken. > + */ > +int kevent_break(struct kevent *k) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&k->ulock, flags); > + k->event.ret_flags |= KEVENT_RET_BROKEN; > + spin_unlock_irqrestore(&k->ulock, flags); > + return -EINVAL; > +} > + > +static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX]; __read_mostly? 
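That is, if the table really is only written from kevent_add_callbacks() at
registration time and only read afterwards in the kevent_init() fast path, it
could presumably be declared along these lines (just a sketch):

	static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;

so it stays out of cachelines that see frequent writes.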
> + > +int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos) > +{ > + struct kevent_callbacks *p; > + > + if (pos >= KEVENT_MAX) > + return -EINVAL; > + > + p = &kevent_registered_callbacks[pos]; > + > + p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break; > + p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break; > + p->callback = (cb->callback) ? cb->callback : kevent_break; Curious... why are these callbacks copied, rather than just retaining a pointer to a const/static "ops" structure? > + > + printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos); Is this printk() chatter necessary? > + return 0; > +} > + > +/* > + * Must be called before event is going to be added into some origin's queue. > + * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks. > + * If failed, kevent should not be used or kevent_enqueue() will fail to add > + * this kevent into origin's queue with setting > + * KEVENT_RET_BROKEN flag in kevent->event.ret_flags. > + */ > +int kevent_init(struct kevent *k) > +{ > + spin_lock_init(&k->ulock); > + k->flags = 0; > + > + if (unlikely(k->event.type >= KEVENT_MAX || > + !kevent_registered_callbacks[k->event.type].callback)) > + return kevent_break(k); > + > + k->callbacks = kevent_registered_callbacks[k->event.type]; > + if (unlikely(k->callbacks.callback == kevent_break)) > + return kevent_break(k); > + > + return 0; > +} > + > +/* > + * Called from ->enqueue() callback when reference counter for given > + * origin (socket, inode...) has been increased. > + */ > +int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k) > +{ > + unsigned long flags; > + > + k->st = st; > + spin_lock_irqsave(&st->lock, flags); > + list_add_tail_rcu(&k->storage_entry, &st->list); > + k->flags |= KEVENT_STORAGE; > + spin_unlock_irqrestore(&st->lock, flags); > + return 0; > +} > + > +/* > + * Dequeue kevent from origin's queue. > + * It does not decrease origin's reference counter in any way > + * and must be called before it, so storage itself must be valid. > + * It is called from ->dequeue() callback. > + */ > +void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&st->lock, flags); > + if (k->flags & KEVENT_STORAGE) { > + list_del_rcu(&k->storage_entry); > + k->flags &= ~KEVENT_STORAGE; > + } > + spin_unlock_irqrestore(&st->lock, flags); > +} > + > +/* > + * Call kevent ready callback and queue it into ready queue if needed. > + * If kevent is marked as one-shot, then remove it from storage queue. 
> + */ > +static void __kevent_requeue(struct kevent *k, u32 event) > +{ > + int ret, rem; > + unsigned long flags; > + > + ret = k->callbacks.callback(k); > + > + spin_lock_irqsave(&k->ulock, flags); > + if (ret > 0) > + k->event.ret_flags |= KEVENT_RET_DONE; > + else if (ret < 0) > + k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE); > + else > + ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE)); > + rem = (k->event.req_flags & KEVENT_REQ_ONESHOT); > + spin_unlock_irqrestore(&k->ulock, flags); > + > + if (ret) { > + if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) { > + list_del_rcu(&k->storage_entry); > + k->flags &= ~KEVENT_STORAGE; > + } > + > + spin_lock_irqsave(&k->user->ready_lock, flags); > + if (!(k->flags & KEVENT_READY)) { > + kevent_user_ring_add_event(k); > + list_add_tail(&k->ready_entry, &k->user->ready_list); > + k->flags |= KEVENT_READY; > + k->user->ready_num++; > + } > + spin_unlock_irqrestore(&k->user->ready_lock, flags); > + wake_up(&k->user->wait); > + } > +} > + > +/* > + * Check if kevent is ready (by invoking it's callback) and requeue/remove > + * if needed. > + */ > +void kevent_requeue(struct kevent *k) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&k->st->lock, flags); > + __kevent_requeue(k, 0); > + spin_unlock_irqrestore(&k->st->lock, flags); > +} > + > +/* > + * Called each time some activity in origin (socket, inode...) is noticed. > + */ > +void kevent_storage_ready(struct kevent_storage *st, > + kevent_callback_t ready_callback, u32 event) > +{ > + struct kevent *k; > + > + rcu_read_lock(); > + if (ready_callback) > + list_for_each_entry_rcu(k, &st->list, storage_entry) > + (*ready_callback)(k); > + > + list_for_each_entry_rcu(k, &st->list, storage_entry) > + if (event & k->event.event) > + __kevent_requeue(k, event); > + rcu_read_unlock(); > +} > + > +int kevent_storage_init(void *origin, struct kevent_storage *st) > +{ > + spin_lock_init(&st->lock); > + st->origin = origin; > + INIT_LIST_HEAD(&st->list); > + return 0; > +} > + > +/* > + * Mark all events as broken, that will remove them from storage, > + * so storage origin (inode, sockt and so on) can be safely removed. > + * No new entries are allowed to be added into the storage at this point. > + * (Socket is removed from file table at this point for example). > + */ > +void kevent_storage_fini(struct kevent_storage *st) > +{ > + kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL); > +} > diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c > new file mode 100644 > index 0000000..ff1a770 > --- /dev/null > +++ b/kernel/kevent/kevent_user.c > @@ -0,0 +1,968 @@ > +/* > + * 2006 Copyright (c) Evgeniy Polyakov > + * All rights reserved. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +static char kevent_name[] = "kevent"; const? > +static kmem_cache_t *kevent_cache; > + > +/* > + * kevents are pollable, return POLLIN and POLLRDNORM > + * when there is at least one ready kevent. > + */ > +static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait) > +{ > + struct kevent_user *u = file->private_data; > + unsigned int mask; > + > + poll_wait(file, &u->wait, wait); > + mask = 0; > + > + if (u->ready_num) > + mask |= POLLIN | POLLRDNORM; > + > + return mask; > +} > + > +static inline void kevent_user_ring_set(struct kevent_user *u, unsigned int num) > +{ > + u->pring[0]->index = num; > +} > + > +static int kevent_user_ring_grow(struct kevent_user *u) > +{ > + unsigned int idx; > + > + idx = (u->pring[0]->index + 1) / KEVENTS_ON_PAGE; > + if (idx >= u->pages_in_use) { > + u->pring[idx] = (void *)__get_free_page(GFP_KERNEL); > + if (!u->pring[idx]) > + return -ENOMEM; > + u->pages_in_use++; > + } > + return 0; > +} > + > +/* > + * Called under kevent_user->ready_lock, so updates are always protected. > + */ > +void kevent_user_ring_add_event(struct kevent *k) > +{ > + unsigned int pidx, off; > + struct kevent_mring *ring, *copy_ring; > + > + ring = k->user->pring[0]; > + > + pidx = ring->index/KEVENTS_ON_PAGE; > + off = ring->index%KEVENTS_ON_PAGE; > + > + if (unlikely(pidx >= k->user->pages_in_use)) { > + printk("%s: ring->index: %u, pidx: %u, pages_in_use: %u.\n", > + __func__, ring->index, pidx, k->user->pages_in_use); > + WARN_ON(1); > + return; > + } > + > + copy_ring = k->user->pring[pidx]; > + > + copy_ring->event[off].id.raw[0] = k->event.id.raw[0]; > + copy_ring->event[off].id.raw[1] = k->event.id.raw[1]; > + copy_ring->event[off].ret_flags = k->event.ret_flags; > + > + if (++ring->index >= KEVENT_MAX_EVENTS) > + ring->index = 0; > +} > + > +/* > + * Initialize mmap ring buffer. > + * It will store ready kevents, so userspace could get them directly instead > + * of using syscall. Esentially syscall becomes just a waiting point. > + */ > +static int kevent_user_ring_init(struct kevent_user *u) > +{ > + int pnum; > + > + pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE; This calculation works with the current constants, but it comes up a page short if, say, KEVENT_MAX_EVENTS were 4095. It also looks incorrect visually since the 'sizeof(unsigned int)' is only factored in once (rather than once per page). I suggest a static / inline __max_kevent_pages() function that either does: return KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE + 1; or int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE; if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE) pnum++; return pnum; Both should be optimized away by the compiler and will give correct answers regardless of the constant values. 
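Spelled out, the second variant would look roughly like this (untested sketch,
keeping the __max_kevent_pages name suggested above):

	static inline unsigned int __max_kevent_pages(void)
	{
		unsigned int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE;

		/* one more page for the tail that does not fill a whole page */
		if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE)
			pnum++;
		return pnum;
	}

kevent_user_ring_init() would then simply do pnum = __max_kevent_pages().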
> + > + u->pring = kmalloc(pnum * sizeof(struct kevent_mring *), GFP_KERNEL); > + if (!u->pring) > + return -ENOMEM; > + > + u->pring[0] = (struct kevent_mring *)__get_free_page(GFP_KERNEL); > + if (!u->pring[0]) > + goto err_out_free; > + > + u->pages_in_use = 1; > + kevent_user_ring_set(u, 0); > + > + return 0; > + > +err_out_free: > + kfree(u->pring); > + > + return -ENOMEM; > +} > + > +static void kevent_user_ring_fini(struct kevent_user *u) > +{ > + int i; > + > + for (i = 0; i < u->pages_in_use; ++i) > + free_page((unsigned long)u->pring[i]); > + > + kfree(u->pring); > +} > + > +static int kevent_user_open(struct inode *inode, struct file *file) > +{ > + struct kevent_user *u; > + int i; > + > + u = kzalloc(sizeof(struct kevent_user), GFP_KERNEL); > + if (!u) > + return -ENOMEM; > + > + INIT_LIST_HEAD(&u->ready_list); > + spin_lock_init(&u->ready_lock); > + kevent_stat_init(u); > + spin_lock_init(&u->kevent_lock); > + for (i = 0; i < ARRAY_SIZE(u->kevent_list); ++i) > + INIT_LIST_HEAD(&u->kevent_list[i]); > + > + mutex_init(&u->ctl_mutex); > + init_waitqueue_head(&u->wait); > + > + atomic_set(&u->refcnt, 1); > + > + if (unlikely(kevent_user_ring_init(u))) { > + kfree(u); > + return -ENOMEM; > + } > + > + file->private_data = u; > + return 0; > +} > + > +/* > + * Kevent userspace control block reference counting. > + * Set to 1 at creation time, when appropriate kevent file descriptor > + * is closed, that reference counter is decreased. > + * When counter hits zero block is freed. > + */ > +static inline void kevent_user_get(struct kevent_user *u) > +{ > + atomic_inc(&u->refcnt); > +} > + > +static inline void kevent_user_put(struct kevent_user *u) > +{ > + if (atomic_dec_and_test(&u->refcnt)) { > + kevent_stat_print(u); > + kevent_user_ring_fini(u); > + kfree(u); > + } > +} > + > +static struct page *kevent_user_nopage(struct vm_area_struct *vma, unsigned long addr, int *type) > +{ > + struct kevent_user *u = vma->vm_file->private_data; > + unsigned long off = (addr - vma->vm_start)/PAGE_SIZE; > + > + if (type) > + *type = VM_FAULT_MINOR; > + > + if (off >= u->pages_in_use) > + goto err_out_sigbus; > + > + return virt_to_page(u->pring[off]); > + > +err_out_sigbus: > + return NOPAGE_SIGBUS; > +} > + > +static struct vm_operations_struct kevent_user_vm_ops = { > + .nopage = &kevent_user_nopage, > +}; > + > +/* > + * Mmap implementation for ring buffer, which is created as array > + * of pages, so vm_pgoff is an offset (in pages, not in bytes) of > + * the first page to be mapped. > + */ > +static int kevent_user_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + unsigned long start = vma->vm_start; > + struct kevent_user *u = file->private_data; > + > + if (vma->vm_flags & VM_WRITE) > + return -EPERM; > + > + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); > + vma->vm_ops = &kevent_user_vm_ops; > + vma->vm_flags |= VM_RESERVED; > + vma->vm_file = file; > + > + if (vm_insert_page(vma, start, virt_to_page(u->pring[0]))) > + return -EFAULT; > + > + return 0; > +} > + > +static inline unsigned int __kevent_user_hash(struct kevent_id *id) > +{ > + return jhash_1word(id->raw[0], 0) & KEVENT_HASH_MASK; > +} > + > +static inline unsigned int kevent_user_hash(struct ukevent *uk) > +{ > + return __kevent_user_hash(&uk->id); > +} > + > +/* > + * RCU protects storage list (kevent->storage_entry). > + * Free entry in RCU callback, it is dequeued from all lists at > + * this point. 
> + */ > + > +static void kevent_free_rcu(struct rcu_head *rcu) > +{ > + struct kevent *kevent = container_of(rcu, struct kevent, rcu_head); > + kmem_cache_free(kevent_cache, kevent); > +} > + > +/* > + * Complete kevent removing - it dequeues kevent from storage list > + * if it is requested, removes kevent from ready list, drops userspace > + * control block reference counter and schedules kevent freeing through RCU. > + */ > +static void kevent_finish_user_complete(struct kevent *k, int deq) > +{ > + struct kevent_user *u = k->user; > + unsigned long flags; > + > + if (deq) > + kevent_dequeue(k); > + > + spin_lock_irqsave(&u->ready_lock, flags); > + if (k->flags & KEVENT_READY) { > + list_del(&k->ready_entry); > + k->flags &= ~KEVENT_READY; > + u->ready_num--; > + } > + spin_unlock_irqrestore(&u->ready_lock, flags); > + > + kevent_user_put(u); > + call_rcu(&k->rcu_head, kevent_free_rcu); > +} > + > +/* > + * Remove from all lists and free kevent. > + * Must be called under kevent_user->kevent_lock to protect > + * kevent->kevent_entry removing. > + */ > +static void __kevent_finish_user(struct kevent *k, int deq) > +{ > + struct kevent_user *u = k->user; > + > + list_del(&k->kevent_entry); > + k->flags &= ~KEVENT_USER; > + u->kevent_num--; > + kevent_finish_user_complete(k, deq); > +} > + > +/* > + * Remove kevent from user's list of all events, > + * dequeue it from storage and decrease user's reference counter, > + * since this kevent does not exist anymore. That is why it is freed here. > + */ > +static void kevent_finish_user(struct kevent *k, int deq) > +{ > + struct kevent_user *u = k->user; > + unsigned long flags; > + > + spin_lock_irqsave(&u->kevent_lock, flags); > + list_del(&k->kevent_entry); > + k->flags &= ~KEVENT_USER; > + u->kevent_num--; > + spin_unlock_irqrestore(&u->kevent_lock, flags); > + kevent_finish_user_complete(k, deq); > +} > + > +/* > + * Dequeue one entry from user's ready queue. > + */ > +static struct kevent *kqueue_dequeue_ready(struct kevent_user *u) > +{ > + unsigned long flags; > + struct kevent *k = NULL; > + > + spin_lock_irqsave(&u->ready_lock, flags); > + if (u->ready_num && !list_empty(&u->ready_list)) { > + k = list_entry(u->ready_list.next, struct kevent, ready_entry); > + list_del(&k->ready_entry); > + k->flags &= ~KEVENT_READY; > + u->ready_num--; > + } > + spin_unlock_irqrestore(&u->ready_lock, flags); > + > + return k; > +} > + > +/* > + * Search a kevent inside hash bucket for given ukevent. > + */ > +static struct kevent *__kevent_search(struct list_head *head, struct ukevent *uk, > + struct kevent_user *u) > +{ > + struct kevent *k, *ret = NULL; > + > + list_for_each_entry(k, head, kevent_entry) { > + if (k->event.user[0] == uk->user[0] && k->event.user[1] == uk->user[1] && > + k->event.id.raw[0] == uk->id.raw[0] && > + k->event.id.raw[1] == uk->id.raw[1]) { > + ret = k; > + break; > + } > + } > + > + return ret; > +} > + > +/* > + * Search and modify kevent according to provided ukevent. 
> + */ > +static int kevent_modify(struct ukevent *uk, struct kevent_user *u) > +{ > + struct kevent *k; > + unsigned int hash = kevent_user_hash(uk); > + int err = -ENODEV; > + unsigned long flags; > + > + spin_lock_irqsave(&u->kevent_lock, flags); > + k = __kevent_search(&u->kevent_list[hash], uk, u); > + if (k) { > + spin_lock(&k->ulock); > + k->event.event = uk->event; > + k->event.req_flags = uk->req_flags; > + k->event.ret_flags = 0; > + spin_unlock(&k->ulock); > + kevent_requeue(k); > + err = 0; > + } > + spin_unlock_irqrestore(&u->kevent_lock, flags); > + > + return err; > +} > + > +/* > + * Remove kevent which matches provided ukevent. > + */ > +static int kevent_remove(struct ukevent *uk, struct kevent_user *u) > +{ > + int err = -ENODEV; > + struct kevent *k; > + unsigned int hash = kevent_user_hash(uk); > + unsigned long flags; > + > + spin_lock_irqsave(&u->kevent_lock, flags); > + k = __kevent_search(&u->kevent_list[hash], uk, u); > + if (k) { > + __kevent_finish_user(k, 1); > + err = 0; > + } > + spin_unlock_irqrestore(&u->kevent_lock, flags); > + > + return err; > +} > + > +/* > + * Detaches userspace control block from file descriptor > + * and decrease it's reference counter. > + * No new kevents can be added or removed from any list at this point. > + */ > +static int kevent_user_release(struct inode *inode, struct file *file) > +{ > + struct kevent_user *u = file->private_data; > + struct kevent *k, *n; > + int i; > + > + for (i = 0; i < ARRAY_SIZE(u->kevent_list); ++i) { > + list_for_each_entry_safe(k, n, &u->kevent_list[i], kevent_entry) > + kevent_finish_user(k, 1); > + } > + > + kevent_user_put(u); > + file->private_data = NULL; > + > + return 0; > +} > + > +/* > + * Read requested number of ukevents in one shot. > + */ > +static struct ukevent *kevent_get_user(unsigned int num, void __user *arg) > +{ > + struct ukevent *ukev; > + > + ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL); > + if (!ukev) > + return NULL; > + > + if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) { > + kfree(ukev); > + return NULL; > + } > + > + return ukev; > +} > + > +/* > + * Read from userspace all ukevents and modify appropriate kevents. > + * If provided number of ukevents is more that threshold, it is faster > + * to allocate a room for them and copy in one shot instead of copy > + * one-by-one and then process them. 
> + */ > +static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg) > +{ > + int err = 0, i; > + struct ukevent uk; > + > + mutex_lock(&u->ctl_mutex); > + > + if (num > u->kevent_num) { > + err = -EINVAL; > + goto out; > + } > + > + if (num > KEVENT_MIN_BUFFS_ALLOC) { > + struct ukevent *ukev; > + > + ukev = kevent_get_user(num, arg); > + if (ukev) { > + for (i = 0; i < num; ++i) { > + if (kevent_modify(&ukev[i], u)) > + ukev[i].ret_flags |= KEVENT_RET_BROKEN; > + ukev[i].ret_flags |= KEVENT_RET_DONE; > + } > + if (copy_to_user(arg, ukev, num*sizeof(struct ukevent))) > + err = -EFAULT; > + kfree(ukev); > + goto out; > + } > + } > + > + for (i = 0; i < num; ++i) { > + if (copy_from_user(&uk, arg, sizeof(struct ukevent))) { > + err = -EFAULT; > + break; > + } > + > + if (kevent_modify(&uk, u)) > + uk.ret_flags |= KEVENT_RET_BROKEN; > + uk.ret_flags |= KEVENT_RET_DONE; > + > + if (copy_to_user(arg, &uk, sizeof(struct ukevent))) { > + err = -EFAULT; > + break; > + } > + > + arg += sizeof(struct ukevent); > + } > +out: > + mutex_unlock(&u->ctl_mutex); > + > + return err; > +} > + > +/* > + * Read from userspace all ukevents and remove appropriate kevents. > + * If provided number of ukevents is more that threshold, it is faster > + * to allocate a room for them and copy in one shot instead of copy > + * one-by-one and then process them. > + */ > +static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg) > +{ > + int err = 0, i; > + struct ukevent uk; > + > + mutex_lock(&u->ctl_mutex); > + > + if (num > u->kevent_num) { > + err = -EINVAL; > + goto out; > + } > + > + if (num > KEVENT_MIN_BUFFS_ALLOC) { > + struct ukevent *ukev; > + > + ukev = kevent_get_user(num, arg); > + if (ukev) { > + for (i = 0; i < num; ++i) { > + if (kevent_remove(&ukev[i], u)) > + ukev[i].ret_flags |= KEVENT_RET_BROKEN; > + ukev[i].ret_flags |= KEVENT_RET_DONE; > + } > + if (copy_to_user(arg, ukev, num*sizeof(struct ukevent))) > + err = -EFAULT; > + kfree(ukev); > + goto out; > + } > + } > + > + for (i = 0; i < num; ++i) { > + if (copy_from_user(&uk, arg, sizeof(struct ukevent))) { > + err = -EFAULT; > + break; > + } > + > + if (kevent_remove(&uk, u)) > + uk.ret_flags |= KEVENT_RET_BROKEN; > + > + uk.ret_flags |= KEVENT_RET_DONE; > + > + if (copy_to_user(arg, &uk, sizeof(struct ukevent))) { > + err = -EFAULT; > + break; > + } > + > + arg += sizeof(struct ukevent); > + } > +out: > + mutex_unlock(&u->ctl_mutex); > + > + return err; > +} > + > +/* > + * Queue kevent into userspace control block and increase > + * it's reference counter. > + */ > +static void kevent_user_enqueue(struct kevent_user *u, struct kevent *k) > +{ > + unsigned long flags; > + unsigned int hash = kevent_user_hash(&k->event); > + > + spin_lock_irqsave(&u->kevent_lock, flags); > + list_add_tail(&k->kevent_entry, &u->kevent_list[hash]); > + k->flags |= KEVENT_USER; > + u->kevent_num++; > + kevent_user_get(u); > + spin_unlock_irqrestore(&u->kevent_lock, flags); > +} > + > +/* > + * Add kevent from both kernel and userspace users. > + * This function allocates and queues kevent, returns negative value > + * on error, positive if kevent is ready immediately and zero > + * if kevent has been queued. 
> + */ > +int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u) > +{ > + struct kevent *k; > + int err; > + > + if (kevent_user_ring_grow(u)) { > + err = -ENOMEM; > + goto err_out_exit; > + } > + > + k = kmem_cache_alloc(kevent_cache, GFP_KERNEL); > + if (!k) { > + err = -ENOMEM; > + goto err_out_exit; > + } > + > + memcpy(&k->event, uk, sizeof(struct ukevent)); > + INIT_RCU_HEAD(&k->rcu_head); > + > + k->event.ret_flags = 0; > + > + err = kevent_init(k); > + if (err) { > + kmem_cache_free(kevent_cache, k); > + goto err_out_exit; > + } > + k->user = u; > + kevent_stat_total(u); > + kevent_user_enqueue(u, k); > + > + err = kevent_enqueue(k); > + if (err) { > + memcpy(uk, &k->event, sizeof(struct ukevent)); > + kevent_finish_user(k, 0); > + goto err_out_exit; > + } > + > + return 0; > + > +err_out_exit: > + if (err < 0) { > + uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE; > + uk->ret_data[1] = err; > + } else if (err > 0) > + uk->ret_flags |= KEVENT_RET_DONE; > + return err; > +} > + > +/* > + * Copy all ukevents from userspace, allocate kevent for each one > + * and add them into appropriate kevent_storages, > + * e.g. sockets, inodes and so on... > + * Ready events will replace ones provided by used and number > + * of ready events is returned. > + * User must check ret_flags field of each ukevent structure > + * to determine if it is fired or failed event. > + */ > +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg) > +{ > + int err, cerr = 0, knum = 0, rnum = 0, i; > + void __user *orig = arg; > + struct ukevent uk; > + > + mutex_lock(&u->ctl_mutex); > + > + err = -EINVAL; > + if (u->kevent_num + num >= KEVENT_MAX_EVENTS) > + goto out_remove; > + > + if (num > KEVENT_MIN_BUFFS_ALLOC) { > + struct ukevent *ukev; > + > + ukev = kevent_get_user(num, arg); > + if (ukev) { > + for (i = 0; i < num; ++i) { > + err = kevent_user_add_ukevent(&ukev[i], u); > + if (err) { > + kevent_stat_im(u); > + if (i != rnum) > + memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent)); > + rnum++; > + } else > + knum++; > + } > + if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent))) > + cerr = -EFAULT; > + kfree(ukev); > + goto out_setup; > + } > + } > + > + for (i = 0; i < num; ++i) { > + if (copy_from_user(&uk, arg, sizeof(struct ukevent))) { > + cerr = -EFAULT; > + break; > + } > + arg += sizeof(struct ukevent); > + > + err = kevent_user_add_ukevent(&uk, u); > + if (err) { > + kevent_stat_im(u); > + if (copy_to_user(orig, &uk, sizeof(struct ukevent))) { > + cerr = -EFAULT; > + break; > + } > + orig += sizeof(struct ukevent); > + rnum++; > + } else > + knum++; > + } > + > +out_setup: > + if (cerr < 0) { > + err = cerr; > + goto out_remove; > + } > + > + err = rnum; > +out_remove: > + mutex_unlock(&u->ctl_mutex); > + > + return err; > +} > + > +/* > + * In nonblocking mode it returns as many events as possible, but not more than @max_nr. > + * In blocking mode it waits until timeout or if at least @min_nr events are ready. 
> + */ > +static int kevent_user_wait(struct file *file, struct kevent_user *u, > + unsigned int min_nr, unsigned int max_nr, __u64 timeout, > + void __user *buf) > +{ > + struct kevent *k; > + int num = 0; > + > + if (!(file->f_flags & O_NONBLOCK)) { > + wait_event_interruptible_timeout(u->wait, > + u->ready_num >= min_nr, > + clock_t_to_jiffies(nsec_to_clock_t(timeout))); > + } > + > + while (num < max_nr && ((k = kqueue_dequeue_ready(u)) != NULL)) { > + if (copy_to_user(buf + num*sizeof(struct ukevent), > + &k->event, sizeof(struct ukevent))) > + break; > + > + /* > + * If it is one-shot kevent, it has been removed already from > + * origin's queue, so we can easily free it here. > + */ > + if (k->event.req_flags & KEVENT_REQ_ONESHOT) > + kevent_finish_user(k, 1); > + ++num; > + kevent_stat_wait(u); > + } > + > + return num; > +} > + > +static struct file_operations kevent_user_fops = { > + .mmap = kevent_user_mmap, > + .open = kevent_user_open, > + .release = kevent_user_release, > + .poll = kevent_user_poll, > + .owner = THIS_MODULE, > +}; const? > + > +static struct miscdevice kevent_miscdev = { > + .minor = MISC_DYNAMIC_MINOR, > + .name = kevent_name, > + .fops = &kevent_user_fops, > +}; const? > + > +static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg) > +{ > + int err; > + struct kevent_user *u = file->private_data; > + > + if (!u || num > KEVENT_MAX_EVENTS) > + return -EINVAL; > + > + switch (cmd) { > + case KEVENT_CTL_ADD: > + err = kevent_user_ctl_add(u, num, arg); > + break; > + case KEVENT_CTL_REMOVE: > + err = kevent_user_ctl_remove(u, num, arg); > + break; > + case KEVENT_CTL_MODIFY: > + err = kevent_user_ctl_modify(u, num, arg); > + break; > + default: > + err = -EINVAL; > + break; > + } > + > + return err; > +} > + > +/* > + * Used to get ready kevents from queue. > + * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT). Isn't this obtained through open(chardev) now? > + * @min_nr - minimum number of ready kevents. > + * @max_nr - maximum number of ready kevents. > + * @timeout - timeout in nanoseconds to wait until some events are ready. > + * @buf - buffer to place ready events. > + * @flags - ununsed for now (will be used for mmap implementation). There is currently an mmap implementation. Is flags still used? > + */ > +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, > + __u64 timeout, struct ukevent __user *buf, unsigned flags) > +{ > + int err = -EINVAL; > + struct file *file; > + struct kevent_user *u; > + > + file = fget(ctl_fd); > + if (!file) > + return -ENODEV; > + > + if (file->f_op != &kevent_user_fops) > + goto out_fput; > + u = file->private_data; > + > + err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf); > +out_fput: > + fput(file); > + return err; > +} > + > +/* > + * This syscall is used to perform waiting until there is free space in kevent queue > + * and removes all ready kevents until and including @index. > + * @ctl_fd - kevent file descriptor. > + * @start - start index of the processed by userspace kevents. > + * @num - number of processed kevents. > + * @timeout - this timeout specifies number of nanoseconds to wait until there is > + * free space in kevent queue. 
> + */ > +asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int start, unsigned int num, __u64 timeout) > +{ > + int err = -EINVAL, found; > + struct file *file; > + struct kevent_user *u; > + struct kevent *k, *n; > + struct mukevent *muk; > + unsigned int idx, off, hash; > + unsigned long flags; > + > + if (start + num >= KEVENT_MAX_EVENTS || > + start >= KEVENT_MAX_EVENTS || > + num >= KEVENT_MAX_EVENTS) Since start and num are unsigned, the last two checks are redundant. If start or num is individually >= KEVENT_MAX_EVENTS, start + num must be. > + return -EINVAL; > + > + file = fget(ctl_fd); > + if (!file) > + return -ENODEV; > + > + if (file->f_op != &kevent_user_fops) > + goto out_fput; > + u = file->private_data; > + > + if (((start + num) / KEVENTS_ON_PAGE) >= u->pages_in_use || > + (start / KEVENTS_ON_PAGE) >= u->pages_in_use) > + goto out_fput; > + > + spin_lock_irqsave(&u->kevent_lock, flags); > + while (num > 0) { > + idx = start / KEVENTS_ON_PAGE; > + off = start % KEVENTS_ON_PAGE; > + > + muk = &u->pring[idx]->event[off]; > + hash = __kevent_user_hash(&muk->id); > + found = 0; > + list_for_each_entry_safe(k, n, &u->kevent_list[hash], kevent_entry) { > + if ((k->event.id.raw[0] == muk->id.raw[0]) && (k->event.id.raw[1] == muk->id.raw[1])) { > + /* > + * Optimization for case when there is only one rearming kevent and > + * userspace is buggy enough and sets start index to zero. > + */ > + if (k->flags & KEVENT_READY) { > + spin_lock(&u->ready_lock); > + if (k->flags & KEVENT_READY) { > + list_del(&k->ready_entry); > + k->flags &= ~KEVENT_READY; > + u->ready_num--; > + } > + spin_unlock(&u->ready_lock); > + } > + > + if (k->event.req_flags & KEVENT_REQ_ONESHOT) > + __kevent_finish_user(k, 1); > + found = 1; > + > + break; > + } > + } > + > + if (!found) { > + spin_unlock_irqrestore(&u->kevent_lock, flags); > + goto out_fput; > + } > + > + if (++start >= KEVENT_MAX_EVENTS) > + start = 0; > + num--; > + } > + spin_unlock_irqrestore(&u->kevent_lock, flags); > + > + if (!(file->f_flags & O_NONBLOCK)) { > + wait_event_interruptible_timeout(u->wait, > + u->ready_num >= 1, > + clock_t_to_jiffies(nsec_to_clock_t(timeout))); > + } > + > + fput(file); > + > + return (u->ready_num >= 1)?0:-EAGAIN; > +out_fput: > + fput(file); > + return err; > +} > + > +/* > + * This syscall is used to perform various control operations > + * on given kevent queue, which is obtained through kevent file descriptor @fd. > + * @cmd - type of operation. > + * @num - number of kevents to be processed. > + * @arg - pointer to array of struct ukevent. > + */ > +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg) > +{ > + int err = -EINVAL; > + struct file *file; > + > + file = fget(fd); > + if (!file) > + return -ENODEV; > + > + if (file->f_op != &kevent_user_fops) > + goto out_fput; > + > + err = kevent_ctl_process(file, cmd, num, arg); > + > +out_fput: > + fput(file); > + return err; > +} > + > +/* > + * Kevent subsystem initialization - create kevent cache and register > + * filesystem to get control file descriptors from. > + */ > +static int __devinit kevent_user_init(void) > +{ > + int err = 0; > + > + kevent_cache = kmem_cache_create("kevent_cache", > + sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL); > + > + err = misc_register(&kevent_miscdev); > + if (err) { > + printk(KERN_ERR "Failed to register kevent miscdev: err=%d.\n", err); Are we leaving the slab cache for kevents around in this failure case for a reason? 
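If it is just an oversight, the existing error label could presumably clean it
up, roughly:

	err_out_exit:
		/* don't leave the kevent slab cache behind if misc_register() fails */
		kmem_cache_destroy(kevent_cache);
		return err;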
And why is a slab creation failure fatal (panic()-worthy) when a chardev registration failure is not?

> +		goto err_out_exit;
> +	}
> +
> +	printk("KEVENT subsystem has been successfully registered.\n");
> +
> +	return 0;
> +
> +err_out_exit:
> +	return err;
> +}
> +
> +static void __devexit kevent_user_fini(void)
> +{
> +	misc_deregister(&kevent_miscdev);
> +}
> +
> +module_init(kevent_user_init);
> +module_exit(kevent_user_fini);
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 6991bec..564e618 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -122,6 +122,10 @@ cond_syscall(ppc_rtas);
>  cond_syscall(sys_spu_run);
>  cond_syscall(sys_spu_create);
>  
> +cond_syscall(sys_kevent_get_events);
> +cond_syscall(sys_kevent_wait);
> +cond_syscall(sys_kevent_ctl);
> +
>  /* mmu depending weak syscall entries */
>  cond_syscall(sys_mprotect);
>  cond_syscall(sys_msync);

Looking pretty good. This is my first pass of comments, and I'll probably have
questions that follow, but I'm trying to get a really good picture of what is
going on here for documentation purposes.

Thanks,
Chase