Hello!
Currently, on the x86_64 architecture, its quite tricky to make
a char device ioctl work for an x86 executables.
In particular,
1. there is a requirement that ioctl number is unique -
which is hard to guarantee especially for out of kernel modules
2. there's a performance huge overhead for each compat call - there's
a hash lookup in a global hash inside a lock_kernel -
and I think compat performance *is* important.
Further, adding a command to the ioctl suddenly requires changing
two places - registration code and ioctl itself.
However, if all ioctl commands for a specific device
are not passing around pointers and long longs,
(relatively common case), and sometimes even
if they are, this ioctl is 64/32 bit compatible -
so why isnt there a simple way to declare this fact?
Here's a patch that simply adds a flag field to f_ops
that will cause all ioctls to get forwarded directly to the
64 bit call.
If all ioctls are compatible for a character device, a flag just
has to be set there, before the device is registered.
MST
diff -ruw linux-2.6.8.1/fs/compat.c linux-2.6.8.1-built/fs/compat.c
--- linux-2.6.8.1/fs/compat.c 2004-08-14 13:55:31.000000000 +0300
+++ linux-2.6.8.1-built/fs/compat.c 2004-09-01 09:52:10.126944256 +0300
@@ -392,7 +392,8 @@
if(!filp)
goto out2;
- if (!filp->f_op || !filp->f_op->ioctl) {
+ if (!filp->f_op || !filp->f_op->ioctl ||
+ (filp->f_op->fops_flags & FOPS_IOCTL_COMPAT)) {
error = sys_ioctl (fd, cmd, arg);
goto out;
}
diff -ruw linux-2.6.8.1/include/linux/fs.h linux-2.6.8.1-built/include/linux/fs.h
--- linux-2.6.8.1/include/linux/fs.h 2004-08-14 13:55:09.000000000 +0300
+++ linux-2.6.8.1-built/include/linux/fs.h 2004-09-01 09:50:07.265622016 +0300
@@ -871,6 +871,7 @@
*/
struct file_operations {
struct module *owner;
+ unsigned long fops_flags; /* Flags, listed below. */
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
@@ -896,6 +897,8 @@
int (*dir_notify)(struct file *filp, unsigned long arg);
};
+#define FOPS_IOCTL_COMPAT 0x00000001 /* ioctl is 32/64 bit compatible */
+
struct inode_operations {
int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> Hello!
> Currently, on the x86_64 architecture, its quite tricky to make
> a char device ioctl work for an x86 executables.
> In particular,
> 1. there is a requirement that ioctl number is unique -
> which is hard to guarantee especially for out of kernel modules
Too bad.
> 2. there's a performance huge overhead for each compat call - there's
> a hash lookup in a global hash inside a lock_kernel -
> and I think compat performance *is* important.
>
> Further, adding a command to the ioctl suddenly requires changing
> two places - registration code and ioctl itself.
So don't add them. Adding a new ioctl is *NOT* a step to be taken lightly -
we used to be far too accepting in that area and as somebody who'd waded
through the resulting dungpiles over the last months I can tell you that
result is utterly revolting.
Excuse me, but I have zero sympathy to people who complain about obstacles
to dumping more into the same piles - it should be hard.
Hello!
Please do not mail me directly.
Quoting [email protected] ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> > 2. there's a performance huge overhead for each compat call - there's
> > a hash lookup in a global hash inside a lock_kernel -
> > and I think compat performance *is* important.
> >
> > Further, adding a command to the ioctl suddenly requires changing
> > two places - registration code and ioctl itself.
>
> So don't add them. Adding a new ioctl is *NOT* a step to be taken lightly -
> we used to be far too accepting in that area and as somebody who'd waded
> through the resulting dungpiles over the last months I can tell you that
> result is utterly revolting.
Why make it a bigger dungpile then?
And what about the performance overhead?
MST
On Wed, 2004-09-01 at 03:32, [email protected]
wrote:
> On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> > Hello!
> > Currently, on the x86_64 architecture, its quite tricky to make
> > a char device ioctl work for an x86 executables.
> > In particular,
> > 1. there is a requirement that ioctl number is unique -
> > which is hard to guarantee especially for out of kernel modules
>
> Too bad.
>
> > 2. there's a performance huge overhead for each compat call - there's
> > a hash lookup in a global hash inside a lock_kernel -
> > and I think compat performance *is* important.
> >
> > Further, adding a command to the ioctl suddenly requires changing
> > two places - registration code and ioctl itself.
>
> So don't add them. Adding a new ioctl is *NOT* a step to be taken lightly -
> we used to be far too accepting in that area and as somebody who'd waded
> through the resulting dungpiles over the last months I can tell you that
> result is utterly revolting.
>
By adding a new ioctl you are adding a new use of the BKL. It has been
suggested on dri-devel that this should be fixed. Is this even
possible?
Lee
Quoting Lee Revell ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> By adding a new ioctl you are adding a new use of the BKL. It has been
> suggested on dri-devel that this should be fixed. Is this even
> possible?
>
> Lee
I dont know - can the lock be released before the call to
filp->f_op->ioctl ?
I assume the reason its there is for legacy code - existing ioctls
may be assuming the BKL is taken, but maybe there could be another
flag in f_ops to let sys_ioctl release the lock before doing the call ...
Like this - would that be safe?
--- linux-2.6.8.1/fs/ioctl.c 2004-08-14 13:54:51.000000000 +0300
+++ linux-2.6.8.1-built/fs/ioctl.c 2004-09-01 11:14:59.944417160 +0300
@@ -126,6 +126,14 @@
error = -ENOTTY;
if (S_ISREG(filp->f_dentry->d_inode->i_mode))
error = file_ioctl(filp, cmd, arg);
+ else if (filp->f_op && filp->f_op->ioctl &&
+ (filp->f_op->fops_flags & FOPS_IOCTL_NOLOCK)) {
+ {
+ unlock_kernel();
+ error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
+ goto out;
+
+ }
else if (filp->f_op && filp->f_op->ioctl)
error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
}
On Wed, 2004-09-01 at 09:22, Michael S. Tsirkin wrote:
> Hello!
> Currently, on the x86_64 architecture, its quite tricky to make
> a char device ioctl work for an x86 executables.
> In particular,
> 1. there is a requirement that ioctl number is unique -
> which is hard to guarantee especially for out of kernel modules
well... external modules thus should be really really careful with
ioctls then and not embrace and extend too much but just use existing
ones instead when reasonable
> 2. there's a performance huge overhead for each compat call - there's
> a hash lookup in a global hash inside a lock_kernel -
> and I think compat performance *is* important.
such is life
>
> Further, adding a command to the ioctl suddenly requires changing
> two places - registration code and ioctl itself.
adding ioctls SHOULD be painful. Really painful. It's similar to adding
syscalls; you'll have to keep compatibility basically forever so adding
should not be an easy thing.
Hi!
Stop being arrogant.
Can you please elaborate on how to make Linux kernel support e.g. motion
controllers? They do not fit *any* known to me driver interface. They have
several axes, they have about twenty parameters (float or integer), and they
have several commands, a-la start, graceful stop, abrupt stop. Plus
obviously diagnostics - about ten another commands with absolutely different
parameters. And about ten motion monitoring commands. And this is one
example I were need to program.
Or take any other freaky device we might got on market tomorrow.
As ioctl() opponent, be kind and give some advice what to do in that kind
of situations.
Al Viro written:
>> Hello!
>> Currently, on the x86_64 architecture, its quite tricky to make
>> a char device ioctl work for an x86 executables.
>> In particular,
>> 1. there is a requirement that ioctl number is unique -
>> which is hard to guarantee especially for out of kernel modules
>
>Too bad.
>
>> 2. there's a performance huge overhead for each compat call - there's
>> a hash lookup in a global hash inside a lock_kernel -
>> and I think compat performance *is* important.
>>
>> Further, adding a command to the ioctl suddenly requires changing
>> two places - registration code and ioctl itself.
>
> So don't add them. Adding a new ioctl is *NOT* a step to be taken lightly -
> we used to be far too accepting in that area and as somebody who'd waded
> through the resulting dungpiles over the last months I can tell you that
> result is utterly revolting.
>
> Excuse me, but I have zero sympathy to people who complain about obstacles
> to dumping more into the same piles - it should be hard.
Arjan van de Ven written:
>> Further, adding a command to the ioctl suddenly requires changing
>> two places - registration code and ioctl itself.
>
>adding ioctls SHOULD be painful. Really painful. It's similar to adding
>syscalls; you'll have to keep compatibility basically forever so adding
>should not be an easy thing.
--- with respect. best regards.
*** Philips @ Saarbruecken.
On Wed, Sep 01, 2004 at 03:50:11AM -0600, [email protected] wrote:
> Hi!
>
> Stop being arrogant.
> Can you please elaborate on how to make Linux kernel support e.g. motion
> controllers? They do not fit *any* known to me driver interface. They have
> several axes, they have about twenty parameters (float or integer), and
parameters nicely fit in sysfs.
> they have several commands, a-la start, graceful stop, abrupt stop. Plus
> obviously diagnostics - about ten another commands with absolutely
> different parameters. And about ten motion monitoring commands. And this is
> one example I were need to program.
a write() interface doesn't work???
Hard to believe, you even call them commands.
fd = open("/dev/funkydevice");
write(fd, "start");
doesn't sound insane to me
Arjan van de Ven writes:
> On Wed, Sep 01, 2004 at 03:50:11AM -0600, [email protected] wrote:
>> Hi!
>>
>> Stop being arrogant.
>> Can you please elaborate on how to make Linux kernel support e.g. motion
>> controllers? They do not fit *any* known to me driver interface. They have
>> several axes, they have about twenty parameters (float or integer), and
>
> parameters nicely fit in sysfs.
>
What about errors?
"set di 200000" might fail for lots of reasons.
>> they have several commands, a-la start, graceful stop, abrupt stop. Plus
>> obviously diagnostics - about ten another commands with absolutely
>> different parameters. And about ten motion monitoring commands. And this is
>> one example I were need to program.
>
> a write() interface doesn't work???
> Hard to believe, you even call them commands.
> fd = open("/dev/funkydevice");
> write(fd, "start");
>
> doesn't sound insane to me
>
it doesn't, since you didn't tryed to make error handling. every thing can
fail - this is control of mechanics - and it fails often and for a lot of
reasons. Put here error handling (write(struct whatever) has to return
another struct whatever2 filled with error description) and thread-safeness.
Pluss some controllers do support multi-dimensional movements "start
x,y,z,etc" - and you might have _several_ error structs. Atomicity is
important for multi-dimensional moves too - move on given axes starts with
single command.
hu?
I do not see much point in renaming ioctl() to write() all over the place -
at least when people see ioctl() they understand that it is not standard
functionality. write() will for sure confuse a lot of people.
--- with respect. best regards.
*** Philips @ Saarbruecken.
Well. I given an example of device which doesn't fit into
read()/write() interface.
Your imagination is truely appreciated - and you are encouraged to
try to implement it. With full error handling (every write must be
checked) and multithreading support (e.g. one user for movement,
multiple for monitoring and diagnosis).
I have tryed to some extent impelementing something like this -
amount of code doubled for no real gain. (It was in those times when
"ioctl()s are bad. period." topic poped up on lkml first time. long time
ago)
No one will ever design interface for sake of interface. what you are
proposing - probably will look nice in academical papers, but no one
will do it, and no one will do it in kernel space.
And all this stuff - all this funcy xml-like comlications? - for
what? just call one function of driver with single parameter - void*.
_*Not more*_. Ridiculous indeed.
P.S. As I have said before, it seems to me that using read()/write()
instead of ioctl() could be the only choice. I once hacked read(): user
must pass two structures: first is input, second is output. It worked -
but no one (including me) liked it. ioctl() even by its name fit better.
P.P.S. Hm. Why not implement ioctl2()? which will be linked directly to
device by its driver? - numbering will be internal to driver, and
provide entry point into driver for user space applications. No one
likes mess with ioctl()s in Linux - no device driver developers, nor
users. But what is really needed - is just call into driver. Paramenter
- single pointer have being proved to be sufficient.
Helge Hafting wrote:
> [email protected] wrote:
>
>> Arjan van de Ven writes:
>>
>>> On Wed, Sep 01, 2004 at 03:50:11AM -0600, [email protected] wrote:
>>>
>>>> Hi! Stop being arrogant.
>>>> Can you please elaborate on how to make Linux kernel support e.g.
>>>> motion controllers? They do not fit *any* known to me driver
>>>> interface. They have several axes, they have about twenty parameters
>>>> (float or integer), and
>>>
>>>
>>>
>>> parameters nicely fit in sysfs.
>>
>>
>>
>> What about errors?
>> "set di 200000" might fail for lots of reasons.
>>
>>>> they have several commands, a-la start, graceful stop, abrupt stop.
>>>> Plus obviously diagnostics - about ten another commands with
>>>> absolutely different parameters. And about ten motion monitoring
>>>> commands. And this is one example I were need to program.
>>>
>>>
>>>
>>> a write() interface doesn't work???
>>> Hard to believe, you even call them commands.
>>> fd = open("/dev/funkydevice");
>>> write(fd, "start");
>>> doesn't sound insane to me
>>
>>
>>
>> it doesn't, since you didn't tryed to make error handling. every thing
>> can fail - this is control of mechanics - and it fails often and for a
>> lot of reasons. Put here error handling (write(struct whatever)
>
>
> Well, how about a read-write interface?
> Write a command to the driver, read back the response. If the response
> indicate
> an error, read more to get error details. Keep reading out possibly
> several errors
> until the driver have nothing more to report.
>
>> has to return another struct whatever2 filled with error description)
>> and thread-safeness. Pluss some controllers do support
>> multi-dimensional movements "start x,y,z,etc" - and you might have
>> _several_ error structs. Atomicity is important for multi-dimensional
>> moves too - move on given axes
>
>
> The driver can handle atomicity - if it somehow receives a partial
> command it should
> buffer it and wait for the rest. If you need several commands for a
> single movement,
> consider barriers, i.e.:
> write(fd, "barrier start");
> write(fd, "one kind of movement");
> write(fd, "other kind of movement");
> write(fd, "third kind of movement");
> write(fd, "barrier end"); /* Complex movement starts at this point */
> read(fd, &error_buffer, size); /* Did this work? */
>
>> starts with single command.
>> hu?
>> I do not see much point in renaming ioctl() to write() all over the
>> place - at least when people see ioctl() they understand that it is
>> not standard functionality. write() will for sure confuse a lot of
>> people.
>
>
> Isn't motion "standard functionality" for this device? If so, a write
> interface fits
> even if the stuff written to it might be special (structs and such)
>
> Helge Hafting
>
On Wed, Sep 01, 2004 at 04:16:59AM -0600, [email protected] wrote:
> I do not see much point in renaming ioctl() to write() all over the place -
> at least when people see ioctl() they understand that it is not standard
> functionality. write() will for sure confuse a lot of people.
Because you can encapsulate write while you can't encapsulate ioctl
for a start. And because ascii is good and opaque binary is bad. You
don't change the target position of your motion device millions of
time per second. You can easily send your commands and replies
(you're allowed read, too) as text, and immediatly you make logging,
debugging and reproducing actions way easier. Plus API portability to
64bits/other endian is free.
OG.
Ihar 'Philips' Filipau wrote:
>
> P.P.S. Hm. Why not implement ioctl2()? which will be linked directly to
> device by its driver? - numbering will be internal to driver, and
> provide entry point into driver for user space applications. No one
> likes mess with ioctl()s in Linux - no device driver developers, nor
> users. But what is really needed - is just call into driver. Paramenter
> - single pointer have being proved to be sufficient.
>
Ok. Now I recalled those mess with ioctl()s - someone have tryed to
implement virtual methods from OO languages with file descriptors and
miserably failed.
I have never used ioctl()s for anything asides calling my drivers. I
have seens people using ioctl codes for special functionality for block
devices.
Position "ioctl()s bad" is not related to drivers per se. People
decided to not introduce another calls like eject_media(fd) or
get_info(fd) - but instead implement ioctl() which will magically work
on all fd's of block devices.
That where mistake is. That the kind of ioctl()s, which are bad.
ioctl() is for something what doesn't have interface. If something have
stable interface - it must be moved over to sys/library calls.
Instead of "painful ioctl()s" - I would rather start with solving
this problem for standard devices: making a way to implement efficiently
common functions for device classes. (terminals, block devices, network
interfaces, etc). And start obsoleting/removing ioctl()s.
I like aproach of *BSD - they routinely implement library/sys calls
for things like that. I used if_*/getif* calls to find/operate network
interfaces - it is much more usable & better documented, than Linux'
bunch of magic ioctl()s (again) on _any_ network socket. Why on any? Why
we cannot have special device to operate on list of interfaces?
I believe people here on LKML identified problem incorrectly.
Arjan van de Ven writes:
> On Wed, Sep 01, 2004 at 03:50:11AM -0600, [email protected] wrote:
>> Stop being arrogant. Can you please elaborate on how
>> to make Linux kernel support e.g. motion controllers?
>> They do not fit *any* known to me driver interface.
>> They have several axes, they have about twenty
>> parameters (float or integer), and
>
> parameters nicely fit in sysfs.
Per-command parameters included?
People really do want private syscalls. An ioctl
is that, in a namespace defined by the file
descriptor. So ioctl() provides local scope to
something that would otherwise be global.
The alternative is to add a zillion odd syscalls.
Life is messy.
>> they have several commands, a-la start, graceful stop,
>> abrupt stop. Plus obviously diagnostics - about ten
>> another commands with absolutely different parameters.
>> And about ten motion monitoring commands. And this is
>> one example I were need to program.
>
> a write() interface doesn't work???
> Hard to believe, you even call them commands.
> fd = open("/dev/funkydevice");
> write(fd, "start");
>
> doesn't sound insane to me
That's just a double-crufty ioctl in disguise.
See also: "the /proc filesystem"
It looks like encouragement of choices that will
tend toward the creation of buffer overflows in
the kernel. Nearly all buffer overflows involve
parsing ASCII. Sure, nobody should make mistakes...
People won't do that though. Know what you'll
get if ioctl isn't used? Here you go:
/////////////////////// Look ma, no ioctl! /////
struct ctldata scd;
struct ctldata *scdp = &scd;
scd.command = GET_MOTOR_TEMPERATURE;
scd.arg[0] = MOTOR_Z;
// Arjan wants it this way.
// (for Viro, must print the pointer in decimal)
write(fd, &scdp, sizeof ctldata);
motor_temp.current = scd.ret[0];
motor_temp.peak = scd.ret[1];
/////////////////////////////////////////////////
Michael S. Tsirkin writes:
> Quoting Lee Revell [snip -- that was excessive]
>> By adding a new ioctl you are adding a new use of
>> the BKL. It has been suggested on dri-devel that
>> this should be fixed. Is this even possible?
>
> I dont know - can the lock be released before the
> call to filp->f_op->ioctl ?
>
> I assume the reason its there is for legacy
> code - existing ioctls may be assuming the BKL
> is taken, but maybe there could be another flag
> in f_ops to let sys_ioctl release the lock before
> doing the call ...
>
> Like this - would that be safe?
Yes. It is proven to work.
Currently the BKL is used to synchronize access to ioctl32_hash_table
in fs/compat.c. It seems that an rwsem would be more appropriate,
since this would allow multiple lookups to occur in parallel (and also
serve the general good of minimizing use of the BKL).
I added lock_kernel()/unlock_kernel() around the call to t->handler
when a compatibility handler is found in compat_sys_ioctl() to
preserve the expectation that the BKL will be held during driver ioctl
operations. It should be safe to do lock_kernel() while holding
ioctl32_sem because of the magic BKL sleep semantics.
I have booted this and run some basic 32-bit userspace on ppc64, and
also compile tested this for x86_64 and sparc64.
Thanks,
Roland
Signed-off-by: Roland Dreier <[email protected]>
Index: linux-bk/fs/compat.c
===================================================================
--- linux-bk.orig/fs/compat.c 2004-09-01 00:57:03.000000000 -0700
+++ linux-bk/fs/compat.c 2004-09-01 07:21:20.000000000 -0700
@@ -41,6 +41,7 @@
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/syscall.h>
#include <linux/personality.h>
+#include <linux/rwsem.h>
#include <net/sock.h> /* siocdevprivate_ioctl */
@@ -247,7 +248,8 @@
/* ioctl32 stuff, used by sparc64, parisc, s390x, ppc64, x86_64, MIPS */
#define IOCTL_HASHSIZE 256
-struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
+static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
+static DECLARE_RWSEM(ioctl32_sem);
extern struct ioctl_trans ioctl_start[];
extern int ioctl_table_size;
@@ -302,12 +304,12 @@
if (!new_t)
return -ENOMEM;
- lock_kernel();
+ down_write(&ioctl32_sem);
for (t = ioctl32_hash_table[hash]; t; t = t->next) {
if (t->cmd == cmd) {
printk(KERN_ERR "Trying to register duplicated ioctl32 "
"handler %x\n", cmd);
- unlock_kernel();
+ up_write(&ioctl32_sem);
kfree(new_t);
return -EINVAL;
}
@@ -317,7 +319,7 @@
new_t->handler = handler;
ioctl32_insert_translation(new_t);
- unlock_kernel();
+ up_write(&ioctl32_sem);
return 0;
}
EXPORT_SYMBOL(register_ioctl32_conversion);
@@ -337,11 +339,11 @@
unsigned long hash = ioctl32_hash(cmd);
struct ioctl_trans *t, *t1;
- lock_kernel();
+ down_write(&ioctl32_sem);
t = ioctl32_hash_table[hash];
if (!t) {
- unlock_kernel();
+ up_write(&ioctl32_sem);
return -EINVAL;
}
@@ -351,7 +353,7 @@
__builtin_return_address(0), cmd);
} else {
ioctl32_hash_table[hash] = t->next;
- unlock_kernel();
+ up_write(&ioctl32_sem);
kfree(t);
return 0;
}
@@ -366,7 +368,7 @@
goto out;
} else {
t->next = t1->next;
- unlock_kernel();
+ up_write(&ioctl32_sem);
kfree(t1);
return 0;
}
@@ -376,7 +378,7 @@
printk(KERN_ERR "Trying to free unknown 32bit ioctl handler %x\n",
cmd);
out:
- unlock_kernel();
+ up_write(&ioctl32_sem);
return -EINVAL;
}
EXPORT_SYMBOL(unregister_ioctl32_conversion);
@@ -397,7 +399,7 @@
goto out;
}
- lock_kernel();
+ down_read(&ioctl32_sem);
t = ioctl32_hash_table[ioctl32_hash (cmd)];
@@ -405,14 +407,16 @@
t = t->next;
if (t) {
if (t->handler) {
+ lock_kernel();
error = t->handler(fd, cmd, arg, filp);
unlock_kernel();
+ up_read(&ioctl32_sem);
} else {
- unlock_kernel();
+ up_read(&ioctl32_sem);
error = sys_ioctl(fd, cmd, arg);
}
} else {
- unlock_kernel();
+ up_read(&ioctl32_sem);
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
error = siocdevprivate_ioctl(fd, cmd, arg);
} else {
Albert> Per-command parameters included?
Albert> People really do want private syscalls. An ioctl is that,
Albert> in a namespace defined by the file descriptor. So ioctl()
Albert> provides local scope to something that would otherwise be
Albert> global.
Yes, this is exactly right. One issue raised by this thread is that
the ioctl32 compatibility code only allows one compatibility handler
per ioctl number. It seems that this creates all sorts of
possibilities for mayhem because it makes the ioctl namespace global
in scope in some situations. Does anyone have any thoughts on if/how
this should be addressed.
- Roland
This thread raises the issue of the best way for a driver to handle
commands from userspace. The typical situation is where the driver
needs to process commands from multiple processes and return a status
for each command.
I happen to work on the same type of drivers as Michael (InfiniBand),
and there are a fairly large number of operations that userspace would
like to call into the kernel for. User applications ask the kernel
driver to do things like "create completion queue." One would like to
make this call in a clean, simple, efficient way.
I can think of four ways to do this:
- ioctl on char device:
Nice because it is synchronous and allows for the kernel to
return a status value easily. Has a well-defined mechanism for
handling 32-bit/64-bit compatibility. Unfortunately ioctl
methods run under the BKL.
- read/write on char device:
OK, except requires some mechanism (tag #) for matching requests
and responses. Nowhere clean to put 32/64 compatibility code.
- netlink:
Similar to read/write except it adds the possibility of dropping
messages.
- syscall:
Syscalls are great in some ways. They are the most direct way
into the kernel, they allow
However: syscalls can't be added from modules; it's (quite
correctly) very hard to get new syscalls added to the kernel;
every arch numbers its syscalls differently. Let's forget about
syscalls.
ioctls end up looking like the least bad solution, although I'm open
to other opinions and I'd love to hear better ideas. I'd be happy
with a policy of only accepting ioctls that are sparse and 32/64 clean
and generally maintainable-looking, but I don't think driver authors
have much alternative to ioctl right now.
Thanks,
Roland
* Roland Dreier ([email protected]) wrote:
> - read/write on char device:
> OK, except requires some mechanism (tag #) for matching requests
> and responses. Nowhere clean to put 32/64 compatibility code.
You forgot a driver specific filesystem which exposes requests in a file
per request type style. Also, there's a simple_transaction type of file
which can allow you send/recv data and should eliminate the need for
tagging. Example, look at nfsd fs (fs/nfsd/nfsctl.c).
thanks,
-chris
--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net
[email protected] wrote:
> On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
>
>>Hello!
>>Currently, on the x86_64 architecture, its quite tricky to make
>>a char device ioctl work for an x86 executables.
>>In particular,
>> 1. there is a requirement that ioctl number is unique -
>> which is hard to guarantee especially for out of kernel modules
>
>
> Too bad.
>
>
>> 2. there's a performance huge overhead for each compat call - there's
>> a hash lookup in a global hash inside a lock_kernel -
>> and I think compat performance *is* important.
>>
>>Further, adding a command to the ioctl suddenly requires changing
>>two places - registration code and ioctl itself.
>
>
> So don't add them. Adding a new ioctl is *NOT* a step to be taken lightly -
> we used to be far too accepting in that area and as somebody who'd waded
> through the resulting dungpiles over the last months I can tell you that
> result is utterly revolting.
>
> Excuse me, but I have zero sympathy to people who complain about obstacles
> to dumping more into the same piles - it should be hard.
I don't think he was complaining so much as providing a rationale for
the patch he offered which is intended to make things better. Seemed to
me like a good context for the patch.
--
-bill davidsen ([email protected])
"The secret to procrastination is to put things off until the
last possible moment - but no longer" -me
Chris> You forgot a driver specific filesystem which exposes
Chris> requests in a file per request type style. Also, there's a
Chris> simple_transaction type of file which can allow you
Chris> send/recv data and should eliminate the need for tagging.
Chris> Example, look at nfsd fs (fs/nfsd/nfsctl.c).
Thanks, I'll take a look at this. How does one handle the 32-bit
userspace / 64-bit kernel compat layer in this setup?
- R.
On Wed, Sep 01, 2004 at 11:12:46AM -0700, Roland Dreier wrote:
> Chris> You forgot a driver specific filesystem which exposes
> Chris> requests in a file per request type style. Also, there's a
> Chris> simple_transaction type of file which can allow you
> Chris> send/recv data and should eliminate the need for tagging.
> Chris> Example, look at nfsd fs (fs/nfsd/nfsctl.c).
>
> Thanks, I'll take a look at this. How does one handle the 32-bit
> userspace / 64-bit kernel compat layer in this setup?
The sane approach is to have your structure platform-independent and have
no compat code at all. Which is what we do with on-disk and over-the-wire
structures anyway. And before Albert starts usual wanksession, no, ASCII
is not the only way to do that...
Hello!
Quoting r. Albert Cahalan ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> Michael S. Tsirkin writes:
> > Quoting Lee Revell [snip -- that was excessive]
>
> >> By adding a new ioctl you are adding a new use of
> >> the BKL. It has been suggested on dri-devel that
> >> this should be fixed. Is this even possible?
> >
> > I dont know - can the lock be released before the
> > call to filp->f_op->ioctl ?
> >
> > I assume the reason its there is for legacy
> > code - existing ioctls may be assuming the BKL
> > is taken, but maybe there could be another flag
> > in f_ops to let sys_ioctl release the lock before
> > doing the call ...
> >
> > Like this - would that be safe?
>
> Yes. It is proven to work.
Now that I look at the ioctl.c code, I see a several get_user/put_user
inside the ioctl which are thus done while BKL is held.
But I thought get_user can block?
Why is this not a bug?
MST
Chris> You forgot a driver specific filesystem which exposes
Chris> requests in a file per request type style. Also, there's a
Chris> simple_transaction type of file which can allow you
Chris> send/recv data and should eliminate the need for tagging.
Chris> Example, look at nfsd fs (fs/nfsd/nfsctl.c).
Thanks for the pointer -- I had a look at this stuff. It seems that
using the simple_transaction stuff is fairly heavyweight -- if I
understand correctly, every operation requires userspace to do
open()-write()-read()-close(), and also uses a page of lowmem. I'm
not sure if this is the best fit for our requirements with InfiniBand
drivers: although the user->kernel calls are not in the data path,
there can still be quite a few of them.
On the other hand, ioctl() holds the BKL through the whole operation
so that's suboptimal as well.
Thanks,
Roland
On Wed, 2004-09-01 at 17:23, Michael S. Tsirkin wrote:
> Hello!
> Quoting r. Albert Cahalan ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> > Michael S. Tsirkin writes:
> > > Quoting Lee Revell [snip -- that was excessive]
> >
> > >> By adding a new ioctl you are adding a new use of
> > >> the BKL. It has been suggested on dri-devel that
> > >> this should be fixed. Is this even possible?
> > >
> > > I dont know - can the lock be released before the
> > > call to filp->f_op->ioctl ?
> > >
> > > I assume the reason its there is for legacy
> > > code - existing ioctls may be assuming the BKL
> > > is taken, but maybe there could be another flag
> > > in f_ops to let sys_ioctl release the lock before
> > > doing the call ...
> > >
> > > Like this - would that be safe?
> >
> > Yes. It is proven to work.
>
> Now that I look at the ioctl.c code, I see a several get_user/put_user
> inside the ioctl which are thus done while BKL is held.
> But I thought get_user can block?
>
> Why is this not a bug?
>
You can sleep while holding the BKL, it is automatically dropped and
reacquired. The BKL has some magic properties, it does not work like a
regular spinlock.
Lee
Michael> Now that I look at the ioctl.c code, I see a several
Michael> get_user/put_user inside the ioctl which are thus done
Michael> while BKL is held. But I thought get_user can block?
Michael> Why is this not a bug?
It's fine to sleep while holding the BKL (it will be dropped and then
reacquired when the process wakes up).
- Roland
* Michael S. Tsirkin ([email protected]) wrote:
> Now that I look at the ioctl.c code, I see a several get_user/put_user
> inside the ioctl which are thus done while BKL is held.
> But I thought get_user can block?
>
> Why is this not a bug?
Look at release_kernel_lock, esp. in kernel/sched.c.
-chris
Hello!
Quoting r. Roland Dreier ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> Albert> Per-command parameters included?
>
> Albert> People really do want private syscalls. An ioctl is that,
> Albert> in a namespace defined by the file descriptor. So ioctl()
> Albert> provides local scope to something that would otherwise be
> Albert> global.
>
> Yes, this is exactly right. One issue raised by this thread is that
> the ioctl32 compatibility code only allows one compatibility handler
> per ioctl number. It seems that this creates all sorts of
> possibilities for mayhem because it makes the ioctl namespace global
> in scope in some situations. Does anyone have any thoughts on if/how
> this should be addressed.
>
Thats what my original patch attempts to address
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0409.0/0025.html
What do you think?
Roland Dreier <[email protected]> wrote:
>
> Currently the BKL is used to synchronize access to ioctl32_hash_table
> in fs/compat.c. It seems that an rwsem would be more appropriate,
> since this would allow multiple lookups to occur in parallel (and also
> serve the general good of minimizing use of the BKL).
It introduces additional bus-atomic operations into the fastpath, so we'll
be slower in the one-process-doing-lots-of ioctls case, and faster in the
lots-of-cpus-doing-ioctls case.
The change certainly makes sense from a clean-things-up and
prepare-for-bkl-removal point of view, however.
Some single-threaded and multi-threaded SMP microbenchmarking would be
nice, if you have time.
Quoting r. Roland Dreier ([email protected]) "Re: f_ops flag to speed up compatible ioctls in linux kernel":
> Roland> Yes, this is exactly right. One issue raised by this
> Roland> thread is that the ioctl32 compatibility code only allows
> Roland> one compatibility handler per ioctl number. It seems that
> Roland> this creates all sorts of possibilities for mayhem because
> Roland> it makes the ioctl namespace global in scope in some
> Roland> situations. Does anyone have any thoughts on if/how this
> Roland> should be addressed?
>
> Michael> Thats what my original patch attempts to address
> Michael> http://www.uwsg.indiana.edu/hypermail/linux/kernel/0409.0/0025.html
> Michael> What do you think?
>
> That patch seems somewhat orthogonal to the issue I raised. You're
> just fixing the problem for devices that don't use the ioctl32 compat
> layer.
>
No I'm removing the need for ioctls to go through the compat layer.
No compat layer, no problem.
I see the compat layer as a hack to support legacy devices without touching
all of their code.
I dont see why a new driver would use the compat hacks instead of simply
building a 32/64 bit compatible interface, and legacy drivers have the
uniqueness for commands solved one way or another.
So the only interesting problem left is for new char devices.
MST
Roland> Yes, this is exactly right. One issue raised by this
Roland> thread is that the ioctl32 compatibility code only allows
Roland> one compatibility handler per ioctl number. It seems that
Roland> this creates all sorts of possibilities for mayhem because
Roland> it makes the ioctl namespace global in scope in some
Roland> situations. Does anyone have any thoughts on if/how this
Roland> should be addressed?
Michael> Thats what my original patch attempts to address
Michael> http://www.uwsg.indiana.edu/hypermail/linux/kernel/0409.0/0025.html
Michael> What do you think?
That patch seems somewhat orthogonal to the issue I raised. You're
just fixing the problem for devices that don't use the ioctl32 compat
layer.
- Roland
Chris> Trivial addition of allowing llseek to become transaction
Chris> barrier increases this to ~300,000 transactions per second.
Chris> Patch below gives you some basic idea.
Wow, great stuff. Thanks a lot for taking the time to try this out.
Also thanks for the original suggestion -- the more I think about the
private filesystem idea the more I like it.
Do you think simple_transaction_llseek() should be merged upstream?
It would definitely make the simple_transaction stuff more useful for
my application.
Thanks,
Roland
Ihar 'Philips' Filipau wrote:
>
>
> Well. I given an example of device which doesn't fit into
> read()/write() interface.
>
> Your imagination is truely appreciated - and you are encouraged to
> try to implement it. With full error handling (every write must be
> checked)
Checking every write is up to the program using the device. Not
forgetting this is easy.
Never write directly to the device, use an interface function in your
program that
both writes, reads status back, and returns that status.
> and multithreading support (e.g. one user for movement, multiple for
> monitoring and diagnosis).
>
In that case I'd use two device nodes. One for the command/error stuff,
and another for
monitoring. The monitoring device could be read-only and support
several simultaneous
openings.
> I have tryed to some extent impelementing something like this -
> amount of code doubled for no real gain. (It was in those times when
> "ioctl()s are bad. period." topic poped up on lkml first time. long
> time ago)
Of course you _can_ use ioctls. Just do it if that's the only useable
way. Wether you can get code
like that merged depends on wether you can convince the right people.
But your driver will work
fine even if you have to distribute it yourself, so it isn't that big a
problem.
>
> No one will ever design interface for sake of interface. what you
> are proposing - probably will look nice in academical papers, but no
> one will do it, and no one will do it in kernel space.
>
> And all this stuff - all this funcy xml-like comlications? - for
> what? just call one function of driver with single parameter - void*.
> _*Not more*_. Ridiculous indeed.
>
I haven't suggested any xml, perhaps someone else did. A write
interface let you pass
almost anything into the driver. My example used ascii messages, you
might want to
pass structs instead. I believe even pointers should work too.
> P.S. As I have said before, it seems to me that using read()/write()
> instead of ioctl() could be the only choice. I once hacked read():
> user must pass two structures: first is input, second is output. It
> worked - but no one (including me) liked it. ioctl() even by its name
> fit better.
I have to agree it seems to fit.
Helge Hafting
On Wed, Sep 01, 2004 at 08:40:15AM -0700, Roland Dreier wrote:
> Currently the BKL is used to synchronize access to ioctl32_hash_table
> in fs/compat.c. It seems that an rwsem would be more appropriate,
> since this would allow multiple lookups to occur in parallel (and also
> serve the general good of minimizing use of the BKL).
>
> I added lock_kernel()/unlock_kernel() around the call to t->handler
> when a compatibility handler is found in compat_sys_ioctl() to
> preserve the expectation that the BKL will be held during driver ioctl
> operations. It should be safe to do lock_kernel() while holding
> ioctl32_sem because of the magic BKL sleep semantics.
>
> I have booted this and run some basic 32-bit userspace on ppc64, and
> also compile tested this for x86_64 and sparc64.
It does not make much sense because the ioctl will take the BKL
anyways.
If you wanted to fix it properly better make it use RCU -
but it cannot work for the case of calling a compat handler.
-Andi
Andi> It does not make much sense because the ioctl will take the
Andi> BKL anyways.
True, but it seems pretty ugly to protect the ioctl32 hash with the
BKL. I think the greater good of reducing use of the BKL should be
looked at.
Andi> If you wanted to fix it properly better make it use RCU -
Andi> but it cannot work for the case of calling a compat handler.
I'm not sure I follow what you're saying. When I looked at this, at
first I thought RCU would be better for the hash lookup, but I didn't
see a way to prevent a compat handler from being removed while it was
running. That's why I moved to a semaphore, which would hold off the
removal until the handler was done running. Is this what you mean?
Do you see a way to use RCU here?
Thanks,
Roland
On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> Hello!
> Currently, on the x86_64 architecture, its quite tricky to make
> a char device ioctl work for an x86 executables.
> In particular,
> 1. there is a requirement that ioctl number is unique -
> which is hard to guarantee especially for out of kernel modules
Yes, that is a problem for some people. But you should
have used an unique number in the first place.
There are some hackish ways to work around it for non modules[1], but at some
point we should probably support it better.
[1] it can be handled, except for module unloading, so you have
to disable that.
> 2. there's a performance huge overhead for each compat call - there's
> a hash lookup in a global hash inside a lock_kernel -
> and I think compat performance *is* important.
Did you actually measure it? I doubt it is a big issue.
-Andi
On Thu, Sep 02, 2004 at 03:26:49PM -0700, Roland Dreier wrote:
> Andi> It does not make much sense because the ioctl will take the
> Andi> BKL anyways.
>
> True, but it seems pretty ugly to protect the ioctl32 hash with the
> BKL. I think the greater good of reducing use of the BKL should be
> looked at.
Replacing one lock with two is IMHO a bad idea. Making the number
of locks smaller and avoiding the locking wall IMHO has priority
even over BKL removal.
>
> Andi> If you wanted to fix it properly better make it use RCU -
> Andi> but it cannot work for the case of calling a compat handler.
>
> I'm not sure I follow what you're saying. When I looked at this, at
> first I thought RCU would be better for the hash lookup, but I didn't
> see a way to prevent a compat handler from being removed while it was
> running. That's why I moved to a semaphore, which would hold off the
> removal until the handler was done running. Is this what you mean?
> Do you see a way to uose RCU here?
The code currently assumes that the compat code is either
in the kernel or in the same module who implements the
device (then the high level module count handling for
the file descriptor takes care of it)
The BKL couldn't protect again removal of sleeping compat
handlers anyways because the BKL is dropped during a
schedule, and they all can sleep in user accesses.
During this scheduling window the module could be unloaded
if its count was zero. But with the assumption above this
cannot happen.
So basically the locking there is not to protect against
running handlers, just to ensure consistency during
the list walking. The list isn't touched anymore
after the compat handler runs, so the sleeping in there
is no problem.
RCU could be used as well to protect the list because
there is no sleep involved.
-Andi
Andi> The BKL couldn't protect again removal of sleeping compat
Andi> handlers anyways because the BKL is dropped during a
Andi> schedule, and they all can sleep in user accesses. During
Andi> this scheduling window the module could be unloaded if its
Andi> count was zero. But with the assumption above this cannot
Andi> happen.
Andi> So basically the locking there is not to protect against
Andi> running handlers, just to ensure consistency during the list
Andi> walking. The list isn't touched anymore after the compat
Andi> handler runs, so the sleeping in there is no problem.
Andi> RCU could be used as well to protect the list because there
Andi> is no sleep involved.
OK, good point. My logic was wrong when I considered RCU. But now I
don't understand what you meant by "it cannot work" when you wrote
this in your previous email:
Andi> If you wanted to fix it properly better make it use RCU -
Andi> but it cannot work for the case of calling a compat handler.
Based on what you just wrote, it seems to me like RCU would actually
work fine. Is this wrong?
Thanks,
Roland
> Andi> If you wanted to fix it properly better make it use RCU -
> Andi> but it cannot work for the case of calling a compat handler.
>
> Based on what you just wrote, it seems to me like RCU would actually
> work fine. Is this wrong?
I meant it wouldn't work if you wanted to fix the module count assumption
I described above. Fixing it would be probably a good idea because it's a
bit of a nasty trap and kind of undocumented. But a global lock for
it is not the right way to do it, if anything you would use the
module counts.
-Andi
Hello!
Quoting Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> > Hello!
> > Currently, on the x86_64 architecture, its quite tricky to make
> > a char device ioctl work for an x86 executables.
> > In particular,
> > 1. there is a requirement that ioctl number is unique -
> > which is hard to guarantee especially for out of kernel modules
>
> Yes, that is a problem for some people. But you should
> have used an unique number in the first place.
Do you mean the _IOC macro and friends?
But their uniqueness depends on allocating a unique magic number
in the first place.
> There are some hackish ways to work around it for non modules[1], but at some
> point we should probably support it better.
>
> [1] it can be handled, except for module unloading, so you have
> to disable that.
Why use the global hash at all?
Why not, for example, pass a parameter to the ioctl function
to make it possible to figure out this is a compat call?
> > 2. there's a performance huge overhead for each compat call - there's
> > a hash lookup in a global hash inside a lock_kernel -
> > and I think compat performance *is* important.
>
> Did you actually measure it? I doubt it is a big issue.
>
But that would depend on what the driver actually does inside
the ioctl and on how many ioctls are already registered, would it not?
I built a silly driver example which just used a semaphore and a switch
statement inside the ioctl.
~/<1>tavor/tools/driver_new>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.357u 4.760s 0:05.11 100.0% 0+0k 0+0io 0pf+0w
~/<1>tavor/tools/driver_new>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.641u 6.486s 0:07.12 100.0% 0+0k 0+0io 0pf+0w
So just looking at system time there seems to be an overhead of
about 20%.
The overhead is bigger if there are collisions in the hash.
For muti-processor scenarious, the difference is much more pronounced
(note I have dual-cpu Opteron system):
~>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest32
/dev/mst/mt23108_pci_cr0 &
[2] 10829
[3] 10830
[2] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.435u 21.322s 0:21.76 99.9% 0+0k 0+0io 0pf+0w
[3] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.683u 21.231s 0:21.92 99.9% 0+0k 0+0io 0pf+0w
~>
~>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest64
/dev/mst/mt23108_pci_cr0 &
[2] 10831
[3] 10832
[3] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.474u 11.194s 0:11.70 99.6% 0+0k 0+0io 0pf+0w
[2] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.476u 11.277s 0:11.75 99.9% 0+0k 0+0io 0pf+0w
~>
So we get 50% slowdown.
I imagine this is the result of BKL contention during the hash lookup.
MST
On Tue, Sep 07, 2004 at 01:40:17PM +0300, Michael S. Tsirkin wrote:
> Hello!
> Quoting Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> > > Hello!
> > > Currently, on the x86_64 architecture, its quite tricky to make
> > > a char device ioctl work for an x86 executables.
> > > In particular,
> > > 1. there is a requirement that ioctl number is unique -
> > > which is hard to guarantee especially for out of kernel modules
> >
> > Yes, that is a problem for some people. But you should
> > have used an unique number in the first place.
>
> Do you mean the _IOC macro and friends?
> But their uniqueness depends on allocating a unique magic number
> in the first place.
Yep. It's not bullet proof, but works pretty well in practice with
a little care.
>
> > There are some hackish ways to work around it for non modules[1], but at some
> > point we should probably support it better.
> >
> > [1] it can be handled, except for module unloading, so you have
> > to disable that.
>
> Why use the global hash at all?
> Why not, for example, pass a parameter to the ioctl function
> to make it possible to figure out this is a compat call?
The main reason is that traditionally there was some resistance
to put compat code into the drivers itself because it "looked too
ugly". So it was just put into a few centralized files. Patching
all the f_ops wouldn't have been practical for this.
Maybe it could be added as an additional mechanism now though.
> > > 2. there's a performance huge overhead for each compat call - there's
> > > a hash lookup in a global hash inside a lock_kernel -
> > > and I think compat performance *is* important.
> >
> > Did you actually measure it? I doubt it is a big issue.
> >
>
> But that would depend on what the driver actually does inside
> the ioctl and on how many ioctls are already registered, would it not?
Most ioctls should be registered at boot, the additional ones
are probably negligible.
>
> I built a silly driver example which just used a semaphore and a switch
> statement inside the ioctl.
>
> ~/<1>tavor/tools/driver_new>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> 0.357u 4.760s 0:05.11 100.0% 0+0k 0+0io 0pf+0w
> ~/<1>tavor/tools/driver_new>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> 0.641u 6.486s 0:07.12 100.0% 0+0k 0+0io 0pf+0w
>
> So just looking at system time there seems to be an overhead of
> about 20%.
That's with an empty ioctl? I would expect most ioctls to do
more work, so the overhead would be less.
Still it could be probably made better.
> The overhead is bigger if there are collisions in the hash.
>
> For muti-processor scenarious, the difference is much more pronounced
> (note I have dual-cpu Opteron system):
>
> ~>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest32
> /dev/mst/mt23108_pci_cr0 &
> [2] 10829
> [3] 10830
> [2] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> 0.435u 21.322s 0:21.76 99.9% 0+0k 0+0io 0pf+0w
> [3] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> 0.683u 21.231s 0:21.92 99.9% 0+0k 0+0io 0pf+0w
> ~>
>
>
> ~>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest64
> /dev/mst/mt23108_pci_cr0 &
> [2] 10831
> [3] 10832
> [3] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> 0.474u 11.194s 0:11.70 99.6% 0+0k 0+0io 0pf+0w
> [2] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> 0.476u 11.277s 0:11.75 99.9% 0+0k 0+0io 0pf+0w
> ~>
>
> So we get 50% slowdown.
> I imagine this is the result of BKL contention during the hash lookup.
Ok, this could be improved agreed (although I still think your microbenchmark
is a bit too artificial)
In theory the BKL could be dropped from the lookup anyways
if RCU is needed for the cleanup. For locking the handler
itself into memory it doesn't make any difference.
What happens when you just remove the lock_kernel() there?
(as long as you don't unload any modules this should be safe)
-Andi
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 01:40:17PM +0300, Michael S. Tsirkin wrote:
> > Hello!
> > Quoting Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > > On Wed, Sep 01, 2004 at 10:22:45AM +0300, Michael S. Tsirkin wrote:
> > > > Hello!
> > > > Currently, on the x86_64 architecture, its quite tricky to make
> > > > a char device ioctl work for an x86 executables.
> > > > In particular,
> > > > 1. there is a requirement that ioctl number is unique -
> > > > which is hard to guarantee especially for out of kernel modules
> > >
> > > Yes, that is a problem for some people. But you should
> > > have used an unique number in the first place.
> >
> > Do you mean the _IOC macro and friends?
> > But their uniqueness depends on allocating a unique magic number
> > in the first place.
>
> Yep. It's not bullet proof, but works pretty well in practice with
> a little care.
Hrmp. I for one *would* like something moer bulletproof.
> >
> > > There are some hackish ways to work around it for non modules[1], but at some
> > > point we should probably support it better.
> > >
> > > [1] it can be handled, except for module unloading, so you have
> > > to disable that.
> >
> > Why use the global hash at all?
> > Why not, for example, pass a parameter to the ioctl function
> > to make it possible to figure out this is a compat call?
>
> The main reason is that traditionally there was some resistance
> to put compat code into the drivers itself because it "looked too
> ugly". So it was just put into a few centralized files. Patching
> all the f_ops wouldn't have been practical for this.
>
> Maybe it could be added as an additional mechanism now though.
I'll try to add it and see what this does not performance,
if this helps I'll send a patch.
> > > > 2. there's a performance huge overhead for each compat call - there's
> > > > a hash lookup in a global hash inside a lock_kernel -
> > > > and I think compat performance *is* important.
> > >
> > > Did you actually measure it? I doubt it is a big issue.
> > >
> >
> > But that would depend on what the driver actually does inside
> > the ioctl and on how many ioctls are already registered, would it not?
>
> Most ioctls should be registered at boot, the additional ones
> are probably negligible.
But this does not matter - the hash collision will add overhead
on each lookup - and whether you have collisions is a matter of luck -
theoretically, some users may use such drivers that you may always have
collisions.
> >
> > I built a silly driver example which just used a semaphore and a switch
> > statement inside the ioctl.
> >
> > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> > 0.357u 4.760s 0:05.11 100.0% 0+0k 0+0io 0pf+0w
> > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> > 0.641u 6.486s 0:07.12 100.0% 0+0k 0+0io 0pf+0w
> >
> > So just looking at system time there seems to be an overhead of
> > about 20%.
>
> That's with an empty ioctl?
Not exactly empty - below's the code snippet.
***
static int ioctl (struct inode *inode, struct file *file, unsigned int opcode, unsigned long udata_l)
{
void* udata=(void*)udata_l;
int minor=MINOR(inode->i_rdev);
struct dev_data* dev=&devices[minor];
int ret=0;
/* By convention, any user gets read access
* and is allowed to use the device.
* Commands with no direction are administration
* commands, and you need write permission
* for this */
if ( _IOC_DIR(opcode) == _IOC_NONE ) {
if (! ( file->f_mode & FMODE_WRITE) ) return -EPERM;
} else {
if (! ( file->f_mode & FMODE_READ) ) return -EPERM;
}
if (down_interruptible(&devices[minor].sem)) {
return -ERESTARTSYS;
}
switch (opcode) {
/* .. snip .. */
case PARAMS:
{
struct mst_pci_params_st paramsd;
paramsd.bar=dev->bar;
paramsd.size=dev->size;
if (copy_to_user(udata, ¶msd, sizeof(paramsd))) {
ret=-EFAULT;
}
goto fin;
}
default:
ret= -ENOTTY;
goto fin;
}
fin:
up(&devices[minor].sem);
return ret;
}
***
> I would expect most ioctls to do
> more work, so the overhead would be less.
> Still it could be probably made better.
Then I expect you'll get bitten by the BKL taken while ioctl runs.
That's another issue that needs addressing, in my opinion.
> > The overhead is bigger if there are collisions in the hash.
> >
> > For muti-processor scenarious, the difference is much more pronounced
> > (note I have dual-cpu Opteron system):
> >
> > ~>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest32
> > /dev/mst/mt23108_pci_cr0 &
> > [2] 10829
> > [3] 10830
> > [2] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> > 0.435u 21.322s 0:21.76 99.9% 0+0k 0+0io 0pf+0w
> > [3] Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> > 0.683u 21.231s 0:21.92 99.9% 0+0k 0+0io 0pf+0w
> > ~>
> >
> >
> > ~>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ;time /tmp/ioctltest64
> > /dev/mst/mt23108_pci_cr0 &
> > [2] 10831
> > [3] 10832
> > [3] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> > 0.474u 11.194s 0:11.70 99.6% 0+0k 0+0io 0pf+0w
> > [2] Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> > 0.476u 11.277s 0:11.75 99.9% 0+0k 0+0io 0pf+0w
> > ~>
> >
> > So we get 50% slowdown.
> > I imagine this is the result of BKL contention during the hash lookup.
>
>
> Ok, this could be improved agreed (although I still think your microbenchmark
> is a bit too artificial)
>
> In theory the BKL could be dropped from the lookup anyways
> if RCU is needed for the cleanup. For locking the handler
> itself into memory it doesn't make any difference.
>
> What happens when you just remove the lock_kernel() there?
> (as long as you don't unload any modules this should be safe)
>
> -Andi
Well, I personally do want to enable module unloading.
I think I'll add a new entry point to f_ops and see what *this* does
to speed. That would be roughly equivalent, and cleaner, right?
MST
On Tue, Sep 07, 2004 at 04:45:18PM +0300, Michael S. Tsirkin wrote:
> > > I built a silly driver example which just used a semaphore and a switch
> > > statement inside the ioctl.
> > >
> > > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> > > 0.357u 4.760s 0:05.11 100.0% 0+0k 0+0io 0pf+0w
> > > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> > > 0.641u 6.486s 0:07.12 100.0% 0+0k 0+0io 0pf+0w
> > >
> > > So just looking at system time there seems to be an overhead of
> > > about 20%.
> >
> > That's with an empty ioctl?
>
> Not exactly empty - below's the code snippet.
Hmm, ok. Surprising then. Can you profile it to see where
the bottleneck exactly is?
> > I would expect most ioctls to do
> > more work, so the overhead would be less.
> > Still it could be probably made better.
>
> Then I expect you'll get bitten by the BKL taken while ioctl runs.
Yes, but that's a general problem, not specific to compat ioctls.
So far nobody dared to drop the BKL for ioctls because it would
require to audit/fix a *lot* of code.
The idea of taking the BKL during the hash lookup was that
when the BKL is taken anyways it doesn't make too much
difference to take it a little bit longer. But this assumed
that the hash lookup is fast. If it isn't maybe the hash
function should just be optimized a bit or the table increased.
Most of the values are known at compile time, so maybe
some perfect hash generator like gperf could be used to
generate a better hash?
> >
> > In theory the BKL could be dropped from the lookup anyways
> > if RCU is needed for the cleanup. For locking the handler
> > itself into memory it doesn't make any difference.
> >
> > What happens when you just remove the lock_kernel() there?
> > (as long as you don't unload any modules this should be safe)
> >
> > -Andi
>
> Well, I personally do want to enable module unloading.
It works fine as long as the compat function is in the same
module as the one providing the file_ops.
> I think I'll add a new entry point to f_ops and see what *this* does
> to speed. That would be roughly equivalent, and cleaner, right?
It may help your module, but won't solve the general problem shorter
term.
-Andi
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 04:45:18PM +0300, Michael S. Tsirkin wrote:
> > > > I built a silly driver example which just used a semaphore and a switch
> > > > statement inside the ioctl.
> > > >
> > > > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
> > > > 0.357u 4.760s 0:05.11 100.0% 0+0k 0+0io 0pf+0w
> > > > ~/<1>tavor/tools/driver_new>time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
> > > > 0.641u 6.486s 0:07.12 100.0% 0+0k 0+0io 0pf+0w
> > > >
> > > > So just looking at system time there seems to be an overhead of
> > > > about 20%.
> > >
> > > That's with an empty ioctl?
> >
> > Not exactly empty - below's the code snippet.
>
> Hmm, ok. Surprising then. Can you profile it to see where
> the bottleneck exactly is?
>
>
>
> > > I would expect most ioctls to do
> > > more work, so the overhead would be less.
> > > Still it could be probably made better.
> >
> > Then I expect you'll get bitten by the BKL taken while ioctl runs.
>
> Yes, but that's a general problem, not specific to compat ioctls.
>
> So far nobody dared to drop the BKL for ioctls because it would
> require to audit/fix a *lot* of code.
But if we have a new entry point in f_ops, drivers will either
be audited as they are migrated to this, or will just take
the BKL themselves.
> The idea of taking the BKL during the hash lookup was that
> when the BKL is taken anyways it doesn't make too much
> difference to take it a little bit longer. But this assumed
> that the hash lookup is fast. If it isn't maybe the hash
> function should just be optimized a bit or the table increased.
>
> Most of the values are known at compile time, so maybe
> some perfect hash generator like gperf could be used to
> generate a better hash?
>
>
> > >
> > > In theory the BKL could be dropped from the lookup anyways
> > > if RCU is needed for the cleanup. For locking the handler
> > > itself into memory it doesn't make any difference.
> > >
> > > What happens when you just remove the lock_kernel() there?
> > > (as long as you don't unload any modules this should be safe)
> > >
> > > -Andi
> >
> > Well, I personally do want to enable module unloading.
>
> It works fine as long as the compat function is in the same
> module as the one providing the file_ops.
>
> > I think I'll add a new entry point to f_ops and see what *this* does
> > to speed. That would be roughly equivalent, and cleaner, right?
>
> It may help your module, but won't solve the general problem shorter
> term.
>
> -Andi
But longer term it will be better, so why not go there?
Once the infrastructure is there, drivers will be able to be
migrated as required.
MSt
On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > It may help your module, but won't solve the general problem shorter
> > term.
> But longer term it will be better, so why not go there?
> Once the infrastructure is there, drivers will be able to be
> migrated as required.
I have no problems with that. You would need two new entry points:
one 64bit one without BKL and a 32bit one also without BKL.
I think there were some objections to this scheme in the past,
but I cannot think of a good alternative.
-Andi
On Tue, Sep 07, 2004 at 05:37:02PM +0300, Michael S. Tsirkin wrote:
> Hello!
> Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > > It may help your module, but won't solve the general problem shorter
> > > > term.
> > > But longer term it will be better, so why not go there?
> > > Once the infrastructure is there, drivers will be able to be
> > > migrated as required.
> >
> > I have no problems with that. You would need two new entry points:
> > one 64bit one without BKL and a 32bit one also without BKL.
> >
> > I think there were some objections to this scheme in the past,
> > but I cannot think of a good alternative.
> >
>
> Maybe one entry point with a flag?
That would be IMHO far uglier than two.
-Andi
On Tue, Sep 07, 2004 at 04:29:46PM +0200, Andi Kleen wrote:
> On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > It may help your module, but won't solve the general problem shorter
> > > term.
> > But longer term it will be better, so why not go there?
> > Once the infrastructure is there, drivers will be able to be
> > migrated as required.
>
> I have no problems with that. You would need two new entry points:
> one 64bit one without BKL and a 32bit one also without BKL.
>
> I think there were some objections to this scheme in the past,
> but I cannot think of a good alternative.
uhm, apologies for dropping in, for what exactly
are two (new) separate entry points needed?
somehow lost context here ...
TIA,
Herbert
> -Andi
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Tue, Sep 07, 2004 at 05:45:43PM +0300, Michael S. Tsirkin wrote:
> > > > but I cannot think of a good alternative.
> > > >
> > >
> > > Maybe one entry point with a flag?
> >
> > That would be IMHO far uglier than two.
> >
> > -Andi
> >
>
> What would be a good name? ioctl32/ioctl64? ioctl_compat/ioctl_native?
Later two sound ok to me.
-Andi
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 05:37:02PM +0300, Michael S. Tsirkin wrote:
> > Hello!
> > Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > > On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > > > It may help your module, but won't solve the general problem shorter
> > > > > term.
> > > > But longer term it will be better, so why not go there?
> > > > Once the infrastructure is there, drivers will be able to be
> > > > migrated as required.
> > >
> > > I have no problems with that. You would need two new entry points:
> > > one 64bit one without BKL and a 32bit one also without BKL.
> > >
> > > I think there were some objections to this scheme in the past,
> > > but I cannot think of a good alternative.
> > >
> >
> > Maybe one entry point with a flag?
>
> That would be IMHO far uglier than two.
>
> -Andi
>
What would be a good name? ioctl32/ioctl64? ioctl_compat/ioctl_native?
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > It may help your module, but won't solve the general problem shorter
> > > term.
> > But longer term it will be better, so why not go there?
> > Once the infrastructure is there, drivers will be able to be
> > migrated as required.
>
> I have no problems with that. You would need two new entry points:
> one 64bit one without BKL and a 32bit one also without BKL.
>
> I think there were some objections to this scheme in the past,
> but I cannot think of a good alternative.
>
Maybe one entry point with a flag?
Compatible ioctls could just ignore the flag.
MST
Hello!
Quoting r. Herbert Poetzl ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 04:29:46PM +0200, Andi Kleen wrote:
> > On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > > It may help your module, but won't solve the general problem shorter
> > > > term.
> > > But longer term it will be better, so why not go there?
> > > Once the infrastructure is there, drivers will be able to be
> > > migrated as required.
> >
> > I have no problems with that. You would need two new entry points:
> > one 64bit one without BKL and a 32bit one also without BKL.
> >
> > I think there were some objections to this scheme in the past,
> > but I cannot think of a good alternative.
>
> uhm, apologies for dropping in, for what exactly
> are two (new) separate entry points needed?
>
> somehow lost context here ...
>
> TIA,
> Herbert
There are two uses BKL in the ioctl call path:
1. BKL is kept across the whole ioctl call
2. BKL is used to protect the compat hash lookup
So ioctl_native is to let drivers declare they dont need the BKL
in they ioctl.
ioctl_compat is to let drivers declare
they dont need the hash lookup so it can be replaced by direct call by pointer.
MST
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 05:45:43PM +0300, Michael S. Tsirkin wrote:
> > > > > but I cannot think of a good alternative.
> > > > >
> > > >
> > > > Maybe one entry point with a flag?
> > >
> > > That would be IMHO far uglier than two.
> > >
> > > -Andi
> > >
> >
> > What would be a good name? ioctl32/ioctl64? ioctl_compat/ioctl_native?
>
> Later two sound ok to me.
>
Wait, I think that a properly coded ioctl can always
figure out this is a compat call by looking at the command
(see example below).
So maybe we can live with just one new entry point with these
semantics?
MST
Example:
my_ioctl.h
//This structure size is different on 32 and 64 bit systems
struct my_foo {
long foobar;
};
#define FOO _IOR(MY_MAGIC,0,struct my_foo)
//
my_ioctl.c
struct my_foo32 {
int foobar;
};
#define FOO32 _IOR(MY_MAGIC,0,struct my_foo32)
static int ioctl_native (struct inode *inode, struct file *file, unsigned int
opcode, unsigned long udata_l);
static int ioctl_compat (struct inode *inode, struct file *file, unsigned int
opcode, unsigned long udata_l);
static int ioctl (struct inode *inode, struct file *file, unsigned int
opcode, unsigned long udata_l)
{
switch (opcode)
{
case FOO32:
return ioctl_compat(inode,file,opcode,udata_l);
case FOO:
default:
return ioctl_native(inode,file,opcode,udata_l);
}
}
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Tue, Sep 07, 2004 at 05:45:43PM +0300, Michael S. Tsirkin wrote:
> > > > > but I cannot think of a good alternative.
> > > > >
> > > >
> > > > Maybe one entry point with a flag?
> > >
> > > That would be IMHO far uglier than two.
> > >
> > > -Andi
> > >
> >
> > What would be a good name? ioctl32/ioctl64? ioctl_compat/ioctl_native?
>
> Later two sound ok to me.
>
> -Andi
Why is FIOQSIZE in arch/x86_64/ia32/ia32_ioctl.c and not in
include/linux/compat_ioctl.h ?
It is as far as I can see returning simply loff_t which is 64 bit
on all architectures.
Thanks,
MST
> Why is FIOQSIZE in arch/x86_64/ia32/ia32_ioctl.c and not in
> include/linux/compat_ioctl.h ?
Probably historical reasons. The ioctls used to be not merged.
>
> It is as far as I can see returning simply loff_t which is 64 bit
> on all architectures.
It could be moved into the generic layer, I agree.
-Andi
> Wait, I think that a properly coded ioctl can always
> figure out this is a compat call by looking at the command
> (see example below).
> So maybe we can live with just one new entry point with these
> semantics?
I think two is cleaner. And needing 8 bytes more for a file_operations
structure is not exactly a catastrophe.
-Andi
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > Wait, I think that a properly coded ioctl can always
> > figure out this is a compat call by looking at the command
> > (see example below).
> > So maybe we can live with just one new entry point with these
> > semantics?
>
> I think two is cleaner. And needing 8 bytes more for a file_operations
> structure is not exactly a catastrophe.
>
> -Andi
OK.
Here (below) is an attempt at a patch.
There are two new entrypoints in file operations (for native and compat ioctls)
both called without BKL being taken at any point.
Since with this approach I cant call sys_ioctl directly from compat_sys_ioctl,
I had to split the common code into a routine std_sys_ioctl.
This handles ioctls which are common for all files,
(mostly without BKL, too - which made it possible to remove compat
macros for these commands ) - and returns status indicating whether ioctl was
handled.
I declared it in linux/ioctl.h, let me know if that's a good place for her.
Boots fine, now to update the driver and check how this works.
I'll follow up, test some more and benchmark this, hopefully tomorrow.
Pls let me know what do you think.
MST
diff -ruwp linux-2.6.8.1/include/linux/fs.h linux-2.6.8.1-new/include/linux/fs.h
--- linux-2.6.8.1/include/linux/fs.h 2004-09-07 19:33:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-08 07:18:20.000000000 +0300
@@ -879,6 +879,8 @@ struct file_operations {
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+ int (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
+ int (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
diff -ruwp linux-2.6.8.1/include/linux/ioctl.h linux-2.6.8.1-new/include/linux/ioctl.h
--- linux-2.6.8.1/include/linux/ioctl.h 2004-08-14 13:54:50.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/ioctl.h 2004-09-08 07:40:19.000000000 +0300
@@ -3,5 +3,11 @@
#include <asm/ioctl.h>
+/* Handles standard ioctl calls */
+struct file;
+int std_sys_ioctl(
+ unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, int* error);
+
#endif /* _LINUX_IOCTL_H */
diff -ruwp linux-2.6.8.1/fs/ioctl.c linux-2.6.8.1-new/fs/ioctl.c
--- linux-2.6.8.1/fs/ioctl.c 2004-09-07 19:30:28.000000000 +0300
+++ linux-2.6.8.1-new/fs/ioctl.c 2004-09-08 07:38:03.000000000 +0300
@@ -35,7 +35,9 @@ static int file_ioctl(struct file *filp,
if ((error = get_user(block, p)) != 0)
return error;
+ lock_kernel();
res = mapping->a_ops->bmap(mapping, block);
+ unlock_kernel();
return put_user(res, p);
}
case FIGETBSZ:
@@ -45,29 +47,21 @@ static int file_ioctl(struct file *filp,
case FIONREAD:
return put_user(i_size_read(inode) - filp->f_pos, p);
}
- if (filp->f_op && filp->f_op->ioctl)
- return filp->f_op->ioctl(inode, filp, cmd, arg);
return -ENOTTY;
}
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+EXPORT_SYMBOL(std_sys_ioctl);
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, long* status)
{
- struct file * filp;
+ int on,error;
unsigned int flag;
- int on, error = -EBADF;
-
- filp = fget(fd);
- if (!filp)
- goto out;
-
error = security_file_ioctl(filp, cmd, arg);
if (error) {
- fput(filp);
- goto out;
+ *status=error;
+ return 0;
}
-
- lock_kernel();
switch (cmd) {
case FIOCLEX:
set_close_on_exec(fd, 1);
@@ -99,8 +93,11 @@ asmlinkage long sys_ioctl(unsigned int f
/* Did FASYNC state change ? */
if ((flag ^ filp->f_flags) & FASYNC) {
- if (filp->f_op && filp->f_op->fasync)
+ if (filp->f_op && filp->f_op->fasync) {
+ lock_kernel();
error = filp->f_op->fasync(fd, filp, on);
+ unlock_kernel();
+ }
else error = -ENOTTY;
}
if (error != 0)
@@ -124,13 +121,44 @@ asmlinkage long sys_ioctl(unsigned int f
break;
default:
error = -ENOTTY;
- if (S_ISREG(filp->f_dentry->d_inode->i_mode))
+ if (S_ISREG(filp->f_dentry->d_inode->i_mode)) {
error = file_ioctl(filp, cmd, arg);
- else if (filp->f_op && filp->f_op->ioctl)
- error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
}
+ if (error == -ENOTTY) return 1;
+ }
+ *status=error;
+ return 0;
+}
+
+
+
+asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+ struct file * filp;
+ int error = -EBADF;
+ int fput_needed;
+
+ filp = fget_light(fd,&fput_needed);
+ if (!filp)
+ goto out;
+
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error)) {
+ goto out;
+ }
+
+ if (filp->f_op && filp->f_op->ioctl_native)
+ error = filp->f_op->ioctl_native(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
+ else if (filp->f_op && filp->f_op->ioctl) {
+ lock_kernel();
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
unlock_kernel();
- fput(filp);
+ }
+
+ fput_light(filp,fput_needed);
out:
return error;
diff -ruwp linux-2.6.8.1/fs/compat.c linux-2.6.8.1-new/fs/compat.c
--- linux-2.6.8.1/fs/compat.c 2004-08-14 13:55:31.000000000 +0300
+++ linux-2.6.8.1-new/fs/compat.c 2004-09-08 07:33:51.000000000 +0300
@@ -387,13 +387,17 @@ asmlinkage long compat_sys_ioctl(unsigne
struct file * filp;
int error = -EBADF;
struct ioctl_trans *t;
+ int fput_needed;
- filp = fget(fd);
+ filp = fget_light(fd, &fput_needed);
if(!filp)
goto out2;
- if (!filp->f_op || !filp->f_op->ioctl) {
- error = sys_ioctl (fd, cmd, arg);
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error))
+ goto out;
+ else if (filp->f_op && filp->f_op->ioctl_compat) {
+ error = filp->f_op->ioctl_compat( filp->f_dentry->d_inode,
+ filp, cmd, arg);
goto out;
}
@@ -406,11 +410,12 @@ asmlinkage long compat_sys_ioctl(unsigne
if (t) {
if (t->handler) {
error = t->handler(fd, cmd, arg, filp);
- unlock_kernel();
- } else {
- unlock_kernel();
- error = sys_ioctl(fd, cmd, arg);
+ } else if (filp->f_op && filp->f_op->ioctl) {
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
}
+ unlock_kernel();
} else {
unlock_kernel();
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
@@ -446,7 +451,7 @@ asmlinkage long compat_sys_ioctl(unsigne
}
}
out:
- fput(filp);
+ fput_light(filp, fput_needed);
out2:
return error;
}
diff -ruwp linux-2.6.8.1/include/linux/compat_ioctl.h linux-2.6.8.1-new/include/linux/compat_ioctl.h
--- linux-2.6.8.1/include/linux/compat_ioctl.h 2004-08-14 13:56:23.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/compat_ioctl.h 2004-09-07 20:19:24.000000000 +0300
@@ -54,15 +54,6 @@ COMPATIBLE_IOCTL(FBIOPUT_VSCREENINFO)
COMPATIBLE_IOCTL(FBIOPAN_DISPLAY)
COMPATIBLE_IOCTL(FBIOGET_CON2FBMAP)
COMPATIBLE_IOCTL(FBIOPUT_CON2FBMAP)
-/* Little f */
-COMPATIBLE_IOCTL(FIOCLEX)
-COMPATIBLE_IOCTL(FIONCLEX)
-COMPATIBLE_IOCTL(FIOASYNC)
-COMPATIBLE_IOCTL(FIONBIO)
-COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
-/* 0x00 */
-COMPATIBLE_IOCTL(FIBMAP)
-COMPATIBLE_IOCTL(FIGETBSZ)
/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
* Some need translations, these do not.
*/
diff -ruwp linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c
--- linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c 2004-08-14 13:55:32.000000000 +0300
+++ linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c 2004-09-07 20:19:05.000000000 +0300
@@ -188,7 +188,6 @@ COMPATIBLE_IOCTL(RTC_RD_TIME)
COMPATIBLE_IOCTL(RTC_SET_TIME)
COMPATIBLE_IOCTL(RTC_WKALM_SET)
COMPATIBLE_IOCTL(RTC_WKALM_RD)
-COMPATIBLE_IOCTL(FIOQSIZE)
/* And these ioctls need translation */
HANDLE_IOCTL(TIOCGDEV, tiocgdev)
On Wed, Sep 08, 2004 at 05:28:08PM +0300, Michael S. Tsirkin wrote:
> --- linux-2.6.8.1/include/linux/fs.h 2004-09-07 19:33:43.000000000 +0300
> +++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-08 07:18:20.000000000 +0300
> @@ -879,6 +879,8 @@ struct file_operations {
> int (*readdir) (struct file *, void *, filldir_t);
> unsigned int (*poll) (struct file *, struct poll_table_struct *);
> int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
> + int (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
> + int (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
Define these as long, not int. No need to waste 32 perfectly good bits on
64bit platforms.
The main thing missing is documentation. You need clear comments what
the locking rules are and what compat is good for.
And you should change the code style to follow Documentation/CodingStyle
Other than that it looks ok to me.
-Andi
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [patch] Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Wed, Sep 08, 2004 at 05:28:08PM +0300, Michael S. Tsirkin wrote:
> > --- linux-2.6.8.1/include/linux/fs.h 2004-09-07 19:33:43.000000000 +0300
> > +++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-08 07:18:20.000000000 +0300
> > @@ -879,6 +879,8 @@ struct file_operations {
> > int (*readdir) (struct file *, void *, filldir_t);
> > unsigned int (*poll) (struct file *, struct poll_table_struct *);
> > int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
> > + int (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
> > + int (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
>
> Define these as long, not int. No need to waste 32 perfectly good bits on
> 64bit platforms.
I was just following ioctl.
And ioctl is the way it is because man IOCTL(2) defines ioctl as
int ioctl(int d, int request, ...);
So I wander what goes on here- the syscall returns a long but
libc cuts the high 32 bit?
Now that I think about it,for compat if you start returning 0 in low
32 bits you are unlike to get the effect you wanted ...
The ioctl_native could be changed but that would make it impossible
for compatible ioctls to just use the same pointer in both.
So what do you think - should I make just the native ioctl a long,
or both, and document that the high 32 bit are cut in the compat call?
> The main thing missing is documentation. You need clear comments what
> the locking rules are and what compat is good for.
Would these be best fit in the header file itself, or in a new
Documentation/ file?
> And you should change the code style to follow Documentation/CodingStyle
I'll go over it again. Something specific that I missed?
> Other than that it looks ok to me.
>
> -Andi
> So I wander what goes on here- the syscall returns a long but
> libc cuts the high 32 bit?
System calls are always long, otherwise the syscall exit code cannot
check properly for signal restarts.
glibc seems to indeed truncate.
>
> Now that I think about it,for compat if you start returning 0 in low
> 32 bits you are unlike to get the effect you wanted ...
> The ioctl_native could be changed but that would make it impossible
> for compatible ioctls to just use the same pointer in both.
>
> So what do you think - should I make just the native ioctl a long,
> or both, and document that the high 32 bit are cut in the compat call?
both + document.
>
> > The main thing missing is documentation. You need clear comments what
> > the locking rules are and what compat is good for.
>
> Would these be best fit in the header file itself, or in a new
> Documentation/ file?
Header file should be enough.
> > And you should change the code style to follow Documentation/CodingStyle
> I'll go over it again. Something specific that I missed?
e.g. your use of white space.
-Andi
On Tue, Sep 07, 2004 at 09:07:19PM +0300, Michael S. Tsirkin wrote:
> Hello!
> Quoting r. Herbert Poetzl ([email protected]) "Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > On Tue, Sep 07, 2004 at 04:29:46PM +0200, Andi Kleen wrote:
> > > On Tue, Sep 07, 2004 at 05:25:30PM +0300, Michael S. Tsirkin wrote:
> > > > > It may help your module, but won't solve the general problem shorter
> > > > > term.
> > > > But longer term it will be better, so why not go there?
> > > > Once the infrastructure is there, drivers will be able to be
> > > > migrated as required.
> > >
> > > I have no problems with that. You would need two new entry points:
> > > one 64bit one without BKL and a 32bit one also without BKL.
> > >
> > > I think there were some objections to this scheme in the past,
> > > but I cannot think of a good alternative.
> >
> > uhm, apologies for dropping in, for what exactly
> > are two (new) separate entry points needed?
> >
> > somehow lost context here ...
> >
> > TIA,
> > Herbert
>
> There are two uses BKL in the ioctl call path:
> 1. BKL is kept across the whole ioctl call
> 2. BKL is used to protect the compat hash lookup
>
> So ioctl_native is to let drivers declare they dont need the BKL
> in they ioctl.
>
> ioctl_compat is to let drivers declare
> they dont need the hash lookup so it can be replaced by direct call by pointer.
okay, thanks, I guess I got it now, just wondered why
there should be different implementations for 32 and
64 bit, but a different semantic explains it ...
thanks,
Herbert
> MST
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [patch] Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> > So I wander what goes on here- the syscall returns a long but
> > libc cuts the high 32 bit?
>
> System calls are always long, otherwise the syscall exit code cannot
> check properly for signal restarts.
>
> glibc seems to indeed truncate.
>
> >
> > Now that I think about it,for compat if you start returning 0 in low
> > 32 bits you are unlike to get the effect you wanted ...
> > The ioctl_native could be changed but that would make it impossible
> > for compatible ioctls to just use the same pointer in both.
> >
> > So what do you think - should I make just the native ioctl a long,
> > or both, and document that the high 32 bit are cut in the compat call?
>
> both + document.
Given that libc truncates the high 32 bit, that compat call can only use
low 32 bit, I begin to really think its better to leave this as int
and avoid the whole issue.
Additional advantage is in keeping
the exact same interface as ioctl is that its easier to change a driver
to use this interface - just assign the call in field, no other changes.
MST
Hello!
Quoting r. Andi Kleen ([email protected]) "Re: [patch] Re: [discuss] f_ops flag to speed up compatible ioctls in linux kernel":
> On Wed, Sep 08, 2004 at 05:28:08PM +0300, Michael S. Tsirkin wrote:
> > --- linux-2.6.8.1/include/linux/fs.h 2004-09-07 19:33:43.000000000 +0300
> > +++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-08 07:18:20.000000000 +0300
> > @@ -879,6 +879,8 @@ struct file_operations {
> > int (*readdir) (struct file *, void *, filldir_t);
> > unsigned int (*poll) (struct file *, struct poll_table_struct *);
> > int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
> > + int (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
> > + int (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
>
> Define these as long, not int. No need to waste 32 perfectly good bits on
> 64bit platforms.
>
> The main thing missing is documentation. You need clear comments what
> the locking rules are and what compat is good for.
>
> And you should change the code style to follow Documentation/CodingStyle
>
> Other than that it looks ok to me.
>
> -Andi
OK, here (below) is an updated version. I added a bit of documentation
and fixed two coding style issues that I found, if there are more issues
please let me know.
There are two new calls done without taking the BKL at any point -
for native and compat ioctls. A separate call for compat ioctl
makes it possible to avoid the (IMO ugly) hash lookup altogether.
I made the calls return long although this means a driver can not just
reuse hos old "ioctl" function - the return type has to be changed.
Otherwise ioctl_native/ioctl_compat are a drop-in replacement.
Toy benchmark: ioctl does a switch and takes a semaphore (note
dual processor results may be more drastic of there is no semaphore to
serialise on).
single process test:
before:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.592u 8.313s 0:08.91 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.669u 5.684s 0:06.36 99.6% 0+0k 0+0io 0pf+0w
after:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.749u 5.133s 0:05.91 99.3% 0+0k 0+0io 1pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.583u 5.424s 0:06.01 99.8% 0+0k 0+0io 0pf+0w
System time reduced by 20% for 32 bit test.
Dual process (on dual CPU):
before:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.777u 28.376s 0:29.15 99.9% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
1.113u 28.179s 0:29.32 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.551u 12.367s 0:12.92 99.9% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.554u 12.447s 0:13.01 99.8% 0+0k 0+0io 0pf+0w
after:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.749u 5.133s 0:05.91 99.3% 0+0k 0+0io 1pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.583u 5.424s 0:06.01 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest32
/dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.626u 9.947s 0:10.60 99.6% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.660u 10.039s 0:10.70 99.9% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.473u 9.857s 0:10.37 99.5% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.632u 9.813s 0:10.44 100.0% 0+0k 0+0io 0pf+0w
Almost 50% improvement for 32 bit code.
MST
diff -ru linux-2.6.8.1/include/linux/fs.h linux-2.6.8.1-new/include/linux/fs.h
--- linux-2.6.8.1/include/linux/fs.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-15 15:12:38.474626592 +0300
@@ -879,6 +879,22 @@
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+ /* The following two calls are a replacement for the ioctl call above.
+ * They take precedence over ioctl - if set, ioctl will not be used.
+ * Unlike ioctl, BKL is not taken: drivers shall manage their own
+ * locking. */
+
+ /* If ioctl_native is set, it is used instead
+ * of ioctl for native ioctl syscalls.
+ * Note that standard glibc ioctl trims the return code to int,
+ * so dont try to put 64 bit there, unless you know what you are doing.
+ */
+ long (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
+ /* If ioctl_compat is set, it is used for compatibility ioctl
+ * (i.e. a 32 bit binary running on a 64 bit OS).
+ * Note that only the low 32 bit of the return code are passed to the
+ * user-space application. */
+ long (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
diff -ru linux-2.6.8.1/Documentation/filesystems/Locking linux-2.6.8.1-new/Documentation/filesystems/Locking
--- linux-2.6.8.1/Documentation/filesystems/Locking 2004-09-14 21:20:57.000000000 +0300
+++ linux-2.6.8.1-new/Documentation/filesystems/Locking 2004-09-15 15:23:35.149796792 +0300
@@ -331,6 +331,10 @@
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int,
unsigned long);
+ long (*ioctl_native) (struct inode *, struct file *, unsigned int,
+ unsigned long);
+ long (*ioctl_compat) (struct inode *, struct file *, unsigned int,
+ unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
@@ -364,6 +368,8 @@
readdir: no
poll: no
ioctl: yes (see below)
+ioctl_native: no (see below)
+ioctl_compat: no (see below)
mmap: no
open: maybe (see below)
flush: no
@@ -409,6 +415,9 @@
anything that resembles union-mount we won't have a struct file for all
components. And there are other reasons why the current interface is a mess...
+->ioctl() on regular files is superceded by the ->ioctl_native() and
+->ioctl_compat() pair. The lock is not taken for these new calls.
+
->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.
diff -ru linux-2.6.8.1/fs/compat.c linux-2.6.8.1-new/fs/compat.c
--- linux-2.6.8.1/fs/compat.c 2004-09-14 21:20:56.000000000 +0300
+++ linux-2.6.8.1-new/fs/compat.c 2004-09-15 15:15:04.384444928 +0300
@@ -385,15 +385,19 @@
unsigned long arg)
{
struct file * filp;
- int error = -EBADF;
+ long error = -EBADF;
struct ioctl_trans *t;
+ int fput_needed;
- filp = fget(fd);
+ filp = fget_light(fd, &fput_needed);
if(!filp)
goto out2;
- if (!filp->f_op || !filp->f_op->ioctl) {
- error = sys_ioctl (fd, cmd, arg);
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error))
+ goto out;
+ else if (filp->f_op && filp->f_op->ioctl_compat) {
+ error = filp->f_op->ioctl_compat( filp->f_dentry->d_inode,
+ filp, cmd, arg);
goto out;
}
@@ -406,11 +410,12 @@
if (t) {
if (t->handler) {
error = t->handler(fd, cmd, arg, filp);
- unlock_kernel();
- } else {
- unlock_kernel();
- error = sys_ioctl(fd, cmd, arg);
+ } else if (filp->f_op && filp->f_op->ioctl) {
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
}
+ unlock_kernel();
} else {
unlock_kernel();
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
@@ -446,7 +451,7 @@
}
}
out:
- fput(filp);
+ fput_light(filp, fput_needed);
out2:
return error;
}
diff -ru linux-2.6.8.1/fs/ioctl.c linux-2.6.8.1-new/fs/ioctl.c
--- linux-2.6.8.1/fs/ioctl.c 2004-09-14 21:20:56.000000000 +0300
+++ linux-2.6.8.1-new/fs/ioctl.c 2004-09-15 15:13:16.065911848 +0300
@@ -35,7 +35,9 @@
if ((error = get_user(block, p)) != 0)
return error;
+ lock_kernel();
res = mapping->a_ops->bmap(mapping, block);
+ unlock_kernel();
return put_user(res, p);
}
case FIGETBSZ:
@@ -45,29 +47,17 @@
case FIONREAD:
return put_user(i_size_read(inode) - filp->f_pos, p);
}
- if (filp->f_op && filp->f_op->ioctl)
- return filp->f_op->ioctl(inode, filp, cmd, arg);
return -ENOTTY;
}
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
- struct file * filp;
+EXPORT_SYMBOL(std_sys_ioctl);
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, int* error)
+{
+ int on;
unsigned int flag;
- int on, error = -EBADF;
-
- filp = fget(fd);
- if (!filp)
- goto out;
-
- error = security_file_ioctl(filp, cmd, arg);
- if (error) {
- fput(filp);
- goto out;
- }
-
- lock_kernel();
+ *error = security_file_ioctl(filp, cmd, arg);
switch (cmd) {
case FIOCLEX:
set_close_on_exec(fd, 1);
@@ -78,7 +68,7 @@
break;
case FIONBIO:
- if ((error = get_user(on, (int __user *)arg)) != 0)
+ if ((*error = get_user(on, (int __user *)arg)) != 0)
break;
flag = O_NONBLOCK;
#ifdef __sparc__
@@ -93,17 +83,20 @@
break;
case FIOASYNC:
- if ((error = get_user(on, (int __user *)arg)) != 0)
+ if ((*error = get_user(on, (int __user *)arg)) != 0)
break;
flag = on ? FASYNC : 0;
/* Did FASYNC state change ? */
if ((flag ^ filp->f_flags) & FASYNC) {
- if (filp->f_op && filp->f_op->fasync)
- error = filp->f_op->fasync(fd, filp, on);
- else error = -ENOTTY;
+ if (filp->f_op && filp->f_op->fasync) {
+ lock_kernel();
+ *error = filp->f_op->fasync(fd, filp, on);
+ unlock_kernel();
+ }
+ else *error = -ENOTTY;
}
- if (error != 0)
+ if (*error != 0)
break;
if (on)
@@ -117,20 +110,50 @@
S_ISREG(filp->f_dentry->d_inode->i_mode) ||
S_ISLNK(filp->f_dentry->d_inode->i_mode)) {
loff_t res = inode_get_bytes(filp->f_dentry->d_inode);
- error = copy_to_user((loff_t __user *)arg, &res, sizeof(res)) ? -EFAULT : 0;
+ *error = copy_to_user((loff_t __user *)arg, &res, sizeof(res)) ? -EFAULT : 0;
}
else
- error = -ENOTTY;
+ *error = -ENOTTY;
break;
default:
- error = -ENOTTY;
- if (S_ISREG(filp->f_dentry->d_inode->i_mode))
- error = file_ioctl(filp, cmd, arg);
- else if (filp->f_op && filp->f_op->ioctl)
- error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
+ *error = -ENOTTY;
+ if (S_ISREG(filp->f_dentry->d_inode->i_mode)) {
+ *error = file_ioctl(filp, cmd, arg);
+ }
+ if (*error == -ENOTTY) return 1;
}
- unlock_kernel();
- fput(filp);
+ return 0;
+}
+
+
+
+asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+ struct file * filp;
+ long error = -EBADF;
+ int fput_needed;
+
+ filp = fget_light(fd,&fput_needed);
+ if (!filp)
+ goto out;
+
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error)) {
+ goto out;
+ }
+
+ if (filp->f_op && filp->f_op->ioctl_native)
+ error = filp->f_op->ioctl_native(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
+ else if (filp->f_op && filp->f_op->ioctl) {
+ lock_kernel();
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
+ unlock_kernel();
+ }
+
+ fput_light(filp,fput_needed);
out:
return error;
diff -ru linux-2.6.8.1/include/linux/ioctl.h linux-2.6.8.1-new/include/linux/ioctl.h
--- linux-2.6.8.1/include/linux/ioctl.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/ioctl.h 2004-09-13 18:04:23.000000000 +0300
@@ -3,5 +3,10 @@
#include <asm/ioctl.h>
+/* Handles standard ioctl calls */
+struct file;
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, int* error);
+
#endif /* _LINUX_IOCTL_H */
diff -ru linux-2.6.8.1/include/linux/compat_ioctl.h linux-2.6.8.1-new/include/linux/compat_ioctl.h
--- linux-2.6.8.1/include/linux/compat_ioctl.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/compat_ioctl.h 2004-09-07 20:19:24.000000000 +0300
@@ -54,15 +54,6 @@
COMPATIBLE_IOCTL(FBIOPAN_DISPLAY)
COMPATIBLE_IOCTL(FBIOGET_CON2FBMAP)
COMPATIBLE_IOCTL(FBIOPUT_CON2FBMAP)
-/* Little f */
-COMPATIBLE_IOCTL(FIOCLEX)
-COMPATIBLE_IOCTL(FIONCLEX)
-COMPATIBLE_IOCTL(FIOASYNC)
-COMPATIBLE_IOCTL(FIONBIO)
-COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
-/* 0x00 */
-COMPATIBLE_IOCTL(FIBMAP)
-COMPATIBLE_IOCTL(FIGETBSZ)
/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
* Some need translations, these do not.
*/
diff -ru linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c
--- linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c 2004-09-14 21:20:50.000000000 +0300
+++ linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c 2004-09-07 20:19:05.000000000 +0300
@@ -188,7 +188,6 @@
COMPATIBLE_IOCTL(RTC_SET_TIME)
COMPATIBLE_IOCTL(RTC_WKALM_SET)
COMPATIBLE_IOCTL(RTC_WKALM_RD)
-COMPATIBLE_IOCTL(FIOQSIZE)
/* And these ioctls need translation */
HANDLE_IOCTL(TIOCGDEV, tiocgdev)
Here's a small update to the ioctl speedup patch (comment tweaks only).
Sorry for reposting the whole message, I do it in the hope
to present some context for the patch.
Feedback is welcome, I think most issues pointed out to me
earlier are resolved, if not let me know.
Summary:
I'm trying to add a fast ioctl path to drivers. Two issues
are addressed: the fact that current drivers need BKL
to be taken around the ioctl call, and the hash lookup for
the compat ioctl for 32 bit architectures.
There are two new calls done without taking the BKL at any point -
for native and compat ioctls. A separate call for compat ioctl
makes it possible to avoid the (IMO ugly) hash lookup altogether.
The assumption is drivers will gradually miggrate to this f_ops
and we'll be able to kill the old ioctl with the BKL completely at some point.
I made the calls return long although this means a driver can not just
reuse the old "ioctl" function - the return type has to be changed.
Otherwise ioctl_native/ioctl_compat are a drop-in replacement
for ioctl.
Toy benchmark: ioctl does a switch and takes a semaphore (note
dual processor results may be more drastic of there is no semaphore to
serialise on).
I have run the benchmark on a dual cpu
x86 64 bit system. I see a significant speedup for compat ioctls
and a slight speedup for native ioctls.
single process test:
before:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.592u 8.313s 0:08.91 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.669u 5.684s 0:06.36 99.6% 0+0k 0+0io 0pf+0w
after:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.749u 5.133s 0:05.91 99.3% 0+0k 0+0io 1pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.583u 5.424s 0:06.01 99.8% 0+0k 0+0io 0pf+0w
System time reduced by 20% for 32 bit test.
Dual process (on dual CPU):
before:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.777u 28.376s 0:29.15 99.9% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
1.113u 28.179s 0:29.32 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.551u 12.367s 0:12.92 99.9% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.554u 12.447s 0:13.01 99.8% 0+0k 0+0io 0pf+0w
after:
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.749u 5.133s 0:05.91 99.3% 0+0k 0+0io 1pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.583u 5.424s 0:06.01 99.8% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest32
/dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.626u 9.947s 0:10.60 99.6% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest32 /dev/mst/mt23108_pci_cr0
0.660u 10.039s 0:10.70 99.9% 0+0k 0+0io 0pf+0w
time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 & ; time /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0 &
wait
[2] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.473u 9.857s 0:10.37 99.5% 0+0k 0+0io 0pf+0w
[1] + Done /tmp/ioctltest64 /dev/mst/mt23108_pci_cr0
0.632u 9.813s 0:10.44 100.0% 0+0k 0+0io 0pf+0w
Almost 50% improvement for 32 bit code.
MST
diff -ru linux-2.6.8.1/include/linux/fs.h linux-2.6.8.1-new/include/linux/fs.h
--- linux-2.6.8.1/include/linux/fs.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/fs.h 2004-09-15 15:12:38.474626592 +0300
@@ -879,6 +879,22 @@
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+ /* The two calls ioctl_native and ioctl_compat described below can be
+ * used as a replacement for the ioctl call above. They take
+ * precedence over ioctl: thus if they are set, ioctl is not used.
+ * Unlike ioctl, BKL is not taken: drivers manage their own locking. */
+
+ /* If ioctl_native is set, it is used instead
+ * of ioctl for native ioctl syscalls.
+ * Note that the standard glibc ioctl trims the return code to type int,
+ * so dont try to put a 64 bit value there.
+ */
+ long (*ioctl_native) (struct inode *, struct file *, unsigned int, unsigned long);
+ /* If ioctl_compat is set, it is used for a 32 bit compatible ioctl
+ * (i.e. a 32 bit binary running on a 64 bit OS).
+ * Note that only the low 32 bit of the return code are passed to the
+ * user-space application. */
+ long (*ioctl_compat) (struct inode *, struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
diff -ru linux-2.6.8.1/Documentation/filesystems/Locking linux-2.6.8.1-new/Documentation/filesystems/Locking
--- linux-2.6.8.1/Documentation/filesystems/Locking 2004-09-14 21:20:57.000000000 +0300
+++ linux-2.6.8.1-new/Documentation/filesystems/Locking 2004-09-15 15:23:35.149796792 +0300
@@ -331,6 +331,10 @@
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int,
unsigned long);
+ long (*ioctl_native) (struct inode *, struct file *, unsigned int,
+ unsigned long);
+ long (*ioctl_compat) (struct inode *, struct file *, unsigned int,
+ unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
@@ -364,6 +368,8 @@
readdir: no
poll: no
ioctl: yes (see below)
+ioctl_native: no (see below)
+ioctl_compat: no (see below)
mmap: no
open: maybe (see below)
flush: no
@@ -409,6 +415,9 @@
anything that resembles union-mount we won't have a struct file for all
components. And there are other reasons why the current interface is a mess...
+->ioctl() on regular files is superceded by the ->ioctl_native() and
+->ioctl_compat() pair. The lock is not taken for these new calls.
+
->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.
diff -ru linux-2.6.8.1/fs/compat.c linux-2.6.8.1-new/fs/compat.c
--- linux-2.6.8.1/fs/compat.c 2004-09-14 21:20:56.000000000 +0300
+++ linux-2.6.8.1-new/fs/compat.c 2004-09-15 15:15:04.384444928 +0300
@@ -385,15 +385,19 @@
unsigned long arg)
{
struct file * filp;
- int error = -EBADF;
+ long error = -EBADF;
struct ioctl_trans *t;
+ int fput_needed;
- filp = fget(fd);
+ filp = fget_light(fd, &fput_needed);
if(!filp)
goto out2;
- if (!filp->f_op || !filp->f_op->ioctl) {
- error = sys_ioctl (fd, cmd, arg);
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error))
+ goto out;
+ else if (filp->f_op && filp->f_op->ioctl_compat) {
+ error = filp->f_op->ioctl_compat( filp->f_dentry->d_inode,
+ filp, cmd, arg);
goto out;
}
@@ -406,11 +410,12 @@
if (t) {
if (t->handler) {
error = t->handler(fd, cmd, arg, filp);
- unlock_kernel();
- } else {
- unlock_kernel();
- error = sys_ioctl(fd, cmd, arg);
+ } else if (filp->f_op && filp->f_op->ioctl) {
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
}
+ unlock_kernel();
} else {
unlock_kernel();
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
@@ -446,7 +451,7 @@
}
}
out:
- fput(filp);
+ fput_light(filp, fput_needed);
out2:
return error;
}
diff -ru linux-2.6.8.1/fs/ioctl.c linux-2.6.8.1-new/fs/ioctl.c
--- linux-2.6.8.1/fs/ioctl.c 2004-09-14 21:20:56.000000000 +0300
+++ linux-2.6.8.1-new/fs/ioctl.c 2004-09-15 15:13:16.065911848 +0300
@@ -35,7 +35,9 @@
if ((error = get_user(block, p)) != 0)
return error;
+ lock_kernel();
res = mapping->a_ops->bmap(mapping, block);
+ unlock_kernel();
return put_user(res, p);
}
case FIGETBSZ:
@@ -45,29 +47,17 @@
case FIONREAD:
return put_user(i_size_read(inode) - filp->f_pos, p);
}
- if (filp->f_op && filp->f_op->ioctl)
- return filp->f_op->ioctl(inode, filp, cmd, arg);
return -ENOTTY;
}
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
- struct file * filp;
+EXPORT_SYMBOL(std_sys_ioctl);
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, int* error)
+{
+ int on;
unsigned int flag;
- int on, error = -EBADF;
-
- filp = fget(fd);
- if (!filp)
- goto out;
-
- error = security_file_ioctl(filp, cmd, arg);
- if (error) {
- fput(filp);
- goto out;
- }
-
- lock_kernel();
+ *error = security_file_ioctl(filp, cmd, arg);
switch (cmd) {
case FIOCLEX:
set_close_on_exec(fd, 1);
@@ -78,7 +68,7 @@
break;
case FIONBIO:
- if ((error = get_user(on, (int __user *)arg)) != 0)
+ if ((*error = get_user(on, (int __user *)arg)) != 0)
break;
flag = O_NONBLOCK;
#ifdef __sparc__
@@ -93,17 +83,20 @@
break;
case FIOASYNC:
- if ((error = get_user(on, (int __user *)arg)) != 0)
+ if ((*error = get_user(on, (int __user *)arg)) != 0)
break;
flag = on ? FASYNC : 0;
/* Did FASYNC state change ? */
if ((flag ^ filp->f_flags) & FASYNC) {
- if (filp->f_op && filp->f_op->fasync)
- error = filp->f_op->fasync(fd, filp, on);
- else error = -ENOTTY;
+ if (filp->f_op && filp->f_op->fasync) {
+ lock_kernel();
+ *error = filp->f_op->fasync(fd, filp, on);
+ unlock_kernel();
+ }
+ else *error = -ENOTTY;
}
- if (error != 0)
+ if (*error != 0)
break;
if (on)
@@ -117,20 +110,50 @@
S_ISREG(filp->f_dentry->d_inode->i_mode) ||
S_ISLNK(filp->f_dentry->d_inode->i_mode)) {
loff_t res = inode_get_bytes(filp->f_dentry->d_inode);
- error = copy_to_user((loff_t __user *)arg, &res, sizeof(res)) ? -EFAULT : 0;
+ *error = copy_to_user((loff_t __user *)arg, &res, sizeof(res)) ? -EFAULT : 0;
}
else
- error = -ENOTTY;
+ *error = -ENOTTY;
break;
default:
- error = -ENOTTY;
- if (S_ISREG(filp->f_dentry->d_inode->i_mode))
- error = file_ioctl(filp, cmd, arg);
- else if (filp->f_op && filp->f_op->ioctl)
- error = filp->f_op->ioctl(filp->f_dentry->d_inode, filp, cmd, arg);
+ *error = -ENOTTY;
+ if (S_ISREG(filp->f_dentry->d_inode->i_mode)) {
+ *error = file_ioctl(filp, cmd, arg);
+ }
+ if (*error == -ENOTTY) return 1;
}
- unlock_kernel();
- fput(filp);
+ return 0;
+}
+
+
+
+asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+ struct file * filp;
+ long error = -EBADF;
+ int fput_needed;
+
+ filp = fget_light(fd,&fput_needed);
+ if (!filp)
+ goto out;
+
+ if (!std_sys_ioctl(fd,cmd,arg,filp,&error)) {
+ goto out;
+ }
+
+ if (filp->f_op && filp->f_op->ioctl_native)
+ error = filp->f_op->ioctl_native(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
+ else if (filp->f_op && filp->f_op->ioctl) {
+ lock_kernel();
+ error = filp->f_op->ioctl(
+ filp->f_dentry->d_inode,
+ filp, cmd, arg);
+ unlock_kernel();
+ }
+
+ fput_light(filp,fput_needed);
out:
return error;
diff -ru linux-2.6.8.1/include/linux/ioctl.h linux-2.6.8.1-new/include/linux/ioctl.h
--- linux-2.6.8.1/include/linux/ioctl.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/ioctl.h 2004-09-13 18:04:23.000000000 +0300
@@ -3,5 +3,10 @@
#include <asm/ioctl.h>
+/* Handles standard ioctl calls */
+struct file;
+int std_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
+ struct file * filp, int* error);
+
#endif /* _LINUX_IOCTL_H */
diff -ru linux-2.6.8.1/include/linux/compat_ioctl.h linux-2.6.8.1-new/include/linux/compat_ioctl.h
--- linux-2.6.8.1/include/linux/compat_ioctl.h 2004-09-14 21:20:43.000000000 +0300
+++ linux-2.6.8.1-new/include/linux/compat_ioctl.h 2004-09-07 20:19:24.000000000 +0300
@@ -54,15 +54,6 @@
COMPATIBLE_IOCTL(FBIOPAN_DISPLAY)
COMPATIBLE_IOCTL(FBIOGET_CON2FBMAP)
COMPATIBLE_IOCTL(FBIOPUT_CON2FBMAP)
-/* Little f */
-COMPATIBLE_IOCTL(FIOCLEX)
-COMPATIBLE_IOCTL(FIONCLEX)
-COMPATIBLE_IOCTL(FIOASYNC)
-COMPATIBLE_IOCTL(FIONBIO)
-COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
-/* 0x00 */
-COMPATIBLE_IOCTL(FIBMAP)
-COMPATIBLE_IOCTL(FIGETBSZ)
/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
* Some need translations, these do not.
*/
diff -ru linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c
--- linux-2.6.8.1/arch/x86_64/ia32/ia32_ioctl.c 2004-09-14 21:20:50.000000000 +0300
+++ linux-2.6.8.1-new/arch/x86_64/ia32/ia32_ioctl.c 2004-09-07 20:19:05.000000000 +0300
@@ -188,7 +188,6 @@
COMPATIBLE_IOCTL(RTC_SET_TIME)
COMPATIBLE_IOCTL(RTC_WKALM_SET)
COMPATIBLE_IOCTL(RTC_WKALM_RD)
-COMPATIBLE_IOCTL(FIOQSIZE)
/* And these ioctls need translation */
HANDLE_IOCTL(TIOCGDEV, tiocgdev)
----- End forwarded message -----