2004-06-28 08:08:34

by William Lee Irwin III

[permalink] [raw]
Subject: kiocb->private is too large for kiocb's on-stack

sizeof(struct kiocb) is dangerously large for a structure commonly
allocated on-stack. This patch converts the 24*sizeof(long) field,
->private, to a void pointer for use by file_operations entrypoints.
A ->dtor() method is added to the kiocb in order to support the release
of dynamically allocated structures referred to by ->private.

The sole in-tree users of ->private are async network read/write,
which are not, in fact, async, and so need not handle preallocated
->private as they would need to if ->ki_retry were ever used. The sole
truly async operations are direct IO pread()/pwrite() which do not
now use ->ki_retry(). All they would need to do in that case is to
check for ->private already being allocated for async kiocbs.

This rips 88B off the stack on 32-bit in the common case.

vs. 2.6.7-mm3

Index: mm3-2.6.7/include/net/sock.h
===================================================================
--- mm3-2.6.7.orig/include/net/sock.h 2004-06-27 15:44:42.365168400 -0700
+++ mm3-2.6.7/include/net/sock.h 2004-06-27 15:50:40.651700568 -0700
@@ -617,17 +617,17 @@
struct scm_cookie *scm;
struct msghdr *msg, async_msg;
struct iovec async_iov;
+ struct kiocb *kiocb;
};

static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb)
{
- BUG_ON(sizeof(struct sock_iocb) > KIOCB_PRIVATE_SIZE);
return (struct sock_iocb *)iocb->private;
}

static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
{
- return container_of((void *)si, struct kiocb, private);
+ return si->kiocb;
}

struct socket_alloc {
Index: mm3-2.6.7/include/linux/aio.h
===================================================================
--- mm3-2.6.7.orig/include/linux/aio.h 2004-06-15 22:18:54.000000000 -0700
+++ mm3-2.6.7/include/linux/aio.h 2004-06-27 15:50:40.654700112 -0700
@@ -23,8 +23,6 @@

#define KIOCB_SYNC_KEY (~0U)

-#define KIOCB_PRIVATE_SIZE (24 * sizeof(long))
-
/* ki_flags bits */
#define KIF_LOCKED 0
#define KIF_KICKED 1
@@ -55,6 +53,7 @@
struct kioctx *ki_ctx; /* may be NULL for sync ops */
int (*ki_cancel)(struct kiocb *, struct io_event *);
long (*ki_retry)(struct kiocb *);
+ void (*ki_dtor)(struct kiocb *);

struct list_head ki_list; /* the aio core uses this
* for cancellation */
@@ -65,8 +64,7 @@
} ki_obj;
__u64 ki_user_data; /* user's data for completion */
loff_t ki_pos;
-
- char private[KIOCB_PRIVATE_SIZE];
+ void *private;
};

#define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -79,6 +77,7 @@
(x)->ki_filp = (filp); \
(x)->ki_ctx = &tsk->active_mm->default_kioctx; \
(x)->ki_cancel = NULL; \
+ (x)->ki_dtor = NULL; \
(x)->ki_obj.tsk = tsk; \
} while (0)

Index: mm3-2.6.7/net/socket.c
===================================================================
--- mm3-2.6.7.orig/net/socket.c 2004-06-15 22:19:13.000000000 -0700
+++ mm3-2.6.7/net/socket.c 2004-06-27 15:50:40.662698896 -0700
@@ -548,9 +548,11 @@
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct kiocb iocb;
+ struct sock_iocb siocb;
int ret;

init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
ret = __sock_sendmsg(&iocb, sock, msg, size);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&iocb);
@@ -581,15 +583,22 @@
size_t size, int flags)
{
struct kiocb iocb;
+ struct sock_iocb siocb;
int ret;

init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&iocb);
return ret;
}

+static void sock_aio_dtor(struct kiocb *iocb)
+{
+ kfree(iocb->private);
+}
+
/*
* Read data from a socket. ubuf is a user mode pointer. We make sure the user
* area ubuf...ubuf+size-1 is writable before asking the protocol.
@@ -598,7 +607,7 @@
static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
size_t size, loff_t pos)
{
- struct sock_iocb *x = kiocb_to_siocb(iocb);
+ struct sock_iocb *x, siocb;
struct socket *sock;
int flags;

@@ -607,6 +616,16 @@
if (size==0) /* Match SYS5 behaviour */
return 0;

+ if (is_sync_kiocb(iocb))
+ x = &siocb;
+ else {
+ x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
+ if (!x)
+ return -ENOMEM;
+ iocb->ki_dtor = sock_aio_dtor;
+ }
+ iocb->private = x;
+ x->kiocb = iocb;
sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
@@ -631,7 +650,7 @@
static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
size_t size, loff_t pos)
{
- struct sock_iocb *x = kiocb_to_siocb(iocb);
+ struct sock_iocb *x, siocb;
struct socket *sock;

if (pos != 0)
@@ -639,6 +658,16 @@
if(size==0) /* Match SYS5 behaviour */
return 0;

+ if (is_sync_kiocb(iocb))
+ x = &siocb;
+ else {
+ x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
+ if (!x)
+ return -ENOMEM;
+ iocb->ki_dtor = sock_aio_dtor;
+ }
+ iocb->private = x;
+ x->kiocb = iocb;
sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
Index: mm3-2.6.7/fs/aio.c
===================================================================
--- mm3-2.6.7.orig/fs/aio.c 2004-06-27 15:44:54.278357320 -0700
+++ mm3-2.6.7/fs/aio.c 2004-06-27 15:50:40.667698136 -0700
@@ -396,6 +396,8 @@
req->ki_cancel = NULL;
req->ki_retry = NULL;
req->ki_obj.user = NULL;
+ req->ki_dtor = NULL;
+ req->private = NULL;

/* Check if the completion queue has enough free space to
* accept an event from this io.
@@ -436,9 +438,13 @@

static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
{
+ if (req->ki_dtor)
+ req->ki_dtor(req);
req->ki_ctx = NULL;
req->ki_filp = NULL;
req->ki_obj.user = NULL;
+ req->ki_dtor = NULL;
+ req->private = NULL;
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;


2004-06-28 08:13:38

by Andrew Morton

[permalink] [raw]
Subject: Re: kiocb->private is too large for kiocb's on-stack

William Lee Irwin III <[email protected]> wrote:
>
> sizeof(struct kiocb) is dangerously large for a structure commonly
> allocated on-stack. This patch converts the 24*sizeof(long) field,
> ->private, to a void pointer for use by file_operations entrypoints.
> A ->dtor() method is added to the kiocb in order to support the release
> of dynamically allocated structures referred to by ->private.
>
> The sole in-tree users of ->private are async network read/write,
> which are not, in fact, async, and so need not handle preallocated
> ->private as they would need to if ->ki_retry were ever used. The sole
> truly async operations are direct IO pread()/pwrite() which do not
> now use ->ki_retry(). All they would need to do in that case is to
> check for ->private already being allocated for async kiocbs.
>
> This rips 88B off the stack on 32-bit in the common case.
>

> int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
> {
> struct kiocb iocb;
> + struct sock_iocb siocb;
> int ret;
>
> init_sync_kiocb(&iocb, NULL);
> + iocb.private = &siocb;
> ret = __sock_sendmsg(&iocb, sock, msg, size);
> if (-EIOCBQUEUED == ret)
> ret = wait_on_sync_kiocb(&iocb);

That's so much better than what we had before it ain't funny.

Was this runtime tested?

2004-06-28 08:20:22

by William Lee Irwin III

[permalink] [raw]
Subject: Re: kiocb->private is too large for kiocb's on-stack

On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
> That's so much better than what we had before it ain't funny.
> Was this runtime tested?

Yes. Oracle exercises this, and it survives OAST.

I'll write a dedicated userspace testcase for the aio operations and
follow up with that.


-- wli

2004-06-28 18:27:07

by David Miller

[permalink] [raw]
Subject: Re: kiocb->private is too large for kiocb's on-stack

On Mon, 28 Jun 2004 01:20:16 -0700
William Lee Irwin III <[email protected]> wrote:

> On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
> > That's so much better than what we had before it ain't funny.
> > Was this runtime tested?
>
> Yes. Oracle exercises this, and it survives OAST.
>
> I'll write a dedicated userspace testcase for the aio operations and
> follow up with that.

This all looks fine to me. Andrew, would you like me to push this
patch along or did you already plan to take care of it.

Nice work William.

2004-06-29 02:02:31

by William Lee Irwin III

[permalink] [raw]
Subject: Re: kiocb->private is too large for kiocb's on-stack

/* On Mon, Jun 28, 2004 at 01:12:32AM -0700, Andrew Morton wrote:
>> That's so much better than what we had before it ain't funny.
>> Was this runtime tested?

On Mon, Jun 28, 2004 at 01:20:16AM -0700, William Lee Irwin III wrote:
> Yes. Oracle exercises this, and it survives OAST.
> I'll write a dedicated userspace testcase for the aio operations and
> follow up with that.

This is pretty damn mindless but seems to pound on the stuff. Maybe
something better can be devised (e.g. passing msgs around in a ring).


-- wli */

#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <limits.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/*
 * Report the current errno with location context and abort.
 * Multi-statement macro wrapped in do { } while (0) so it behaves as a
 * single statement.  errno is saved first because perror()/fprintf()
 * may themselves clobber it.
 *
 * Changes vs. original: C99 standard __func__ replaces the GCC-only
 * __FUNCTION__, and the temporary avoids the reserved identifier
 * "__err" (names with a double leading underscore belong to the
 * implementation).  The emitted messages are unchanged.
 */
#define die() \
do { \
	int err_ = errno; \
	perror(__func__); \
	fprintf(stderr, "dead in %s() at %s:%d errno = %d\n", \
		__func__, __FILE__, __LINE__, err_); \
	abort(); \
} while (0)

/*
 * Raw-syscall wrapper for gettid(2); glibc of this era exported no
 * gettid() function.  Returns the caller's kernel thread id.
 */
pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}

/*
 * Raw-syscall wrapper for io_submit(2): queue n iocbs on the given AIO
 * context.  Returns the number submitted, or -1 with errno set.
 */
long io_submit(aio_context_t context, long n, struct iocb **iocbs)
{
	long rc = syscall(__NR_io_submit, context, n, iocbs);

	return rc;
}

/*
 * Raw-syscall wrapper for io_setup(2): create an AIO context able to
 * hold n in-flight events.  *context must be 0 on entry; on success it
 * receives the new context handle.  Returns 0 or -1 with errno set.
 */
long io_setup(unsigned n, aio_context_t *context)
{
	long rc = syscall(__NR_io_setup, n, context);

	return rc;
}

/*
 * Raw-syscall wrapper for io_getevents(2): reap between min and n
 * completion events from the context, waiting at most *ts (NULL waits
 * indefinitely).  Returns the number reaped, or -1 with errno set.
 */
long io_getevents(aio_context_t context, long min, long n,
		  struct io_event *events, struct timespec *ts)
{
	long rc = syscall(__NR_io_getevents, context, min, n, events, ts);

	return rc;
}

/*
 * Convenience shorthand: register fd on the epoll instance epfd with
 * the given event mask/data.  Returns epoll_ctl()'s result (0 or -1).
 */
int epoll_add(int epfd, int fd, struct epoll_event *event)
{
	int op = EPOLL_CTL_ADD;

	return epoll_ctl(epfd, op, fd, event);
}

/*
 * Client thread body.  arg points at this client's pair of socketpair
 * fds; the thread owns fds[1] and ping-pongs its pthread id over it
 * forever, printing a liveness message every 2^16 round trips.
 *
 * Fix vs. original: the write()/read() return values were ignored, so
 * a closed or failing peer turned the loop into a busy spin.  Any
 * short or failed transfer now ends the thread.  (A short read on a
 * SOCK_STREAM pair is treated as failure too — fine for a test whose
 * payload is a handful of bytes.)
 */
static void *client(void *arg)
{
	int n = 0, *fds = arg;
	pthread_t id = pthread_self();

	while (1) {
		if (write(fds[1], &id, sizeof(pthread_t)) != sizeof(pthread_t))
			break;
		if (read(fds[1], &id, sizeof(pthread_t)) != sizeof(pthread_t))
			break;
		++n;
		if (!(n % (1 << 16)))
			printf("boo from %d\n", gettid());
	}
	return NULL;
}

/*
 * Stress driver for the socket AIO paths: spawn nr_clients threads,
 * each ping-ponging its pthread id over one end of a socketpair, while
 * this thread services the other ends with epoll + kernel AIO
 * (IOCB_CMD_PREAD / IOCB_CMD_PWRITE) so sock_aio_read()/sock_aio_write()
 * are exercised continuously.  Usage: prog <nr_clients>.  Runs forever.
 *
 * Fixes vs. original:
 *  - the post-calloc memset of io_events used a 2* factor and overran
 *    its allocation by nr_clients * sizeof(struct io_event) bytes; all
 *    memsets are dropped since calloc() already zero-fills;
 *  - the bufs overflow check now uses the same 2* factor as its
 *    allocation;
 *  - the completion loop recovers the buffer pointer from the event's
 *    data field instead of indexing bufs[2*(cb - iocbs)], which ranged
 *    up to ~4*nr_clients and ran off the end of bufs[].
 */
int main(int argc, char * const argv[])
{
	long i, j, k, nr_clients;
	pthread_t *clients;
	int *svs, epfd;
	struct epoll_event *events;
	aio_context_t context = 0;
	struct iocb *iocbs, **iocbps;
	struct io_event *io_events;
	pthread_t *bufs;

	if (argc != 2)
		return EXIT_FAILURE;
	nr_clients = strtol(argv[1], NULL, 0);
	if (nr_clients >= INT_MAX || nr_clients < 0)
		return EXIT_FAILURE;
	/*
	 * Belt-and-braces overflow checks for the calloc()s below (calloc
	 * itself also checks n*size).  Each bound matches its allocation's
	 * per-client size; bufs gets 2*sizeof(pthread_t) per client, so
	 * its bound carries the same factor of two.
	 */
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(pthread_t)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(int)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct iocb)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct iocb *)))
		return EXIT_FAILURE;
	if ((size_t)nr_clients >= SIZE_MAX/(2*sizeof(struct io_event)))
		return EXIT_FAILURE;
	/*
	 * calloc() zero-fills, so no memset()s are needed after these.
	 * NOTE: die() abort()s, so the gotos after it never execute; the
	 * cleanup ladder is kept for structure/documentation only.
	 */
	if (!(bufs = calloc(nr_clients, 2*sizeof(pthread_t))))
		return EXIT_FAILURE;
	if (!(clients = calloc(nr_clients, sizeof(pthread_t)))) {
		die();
		goto free_bufs;
	}
	if (!(svs = calloc(nr_clients, 2*sizeof(int)))) {
		die();
		goto free_clients;
	}
	if (!(events = calloc(nr_clients, sizeof(struct epoll_event)))) {
		die();
		goto free_svs;
	}
	if (!(iocbs = calloc(nr_clients, 2*sizeof(struct iocb)))) {
		die();
		goto free_events;
	}
	if (!(iocbps = calloc(nr_clients, 2*sizeof(struct iocb *)))) {
		die();
		goto free_iocbs;
	}
	if (!(io_events = calloc(nr_clients, sizeof(struct io_event)))) {
		die();
		goto free_iocbps;
	}
	if ((epfd = epoll_create(nr_clients)) < 0) {
		die();
		goto free_io_events;
	}
	if (io_setup(nr_clients, &context)) {
		die();
		goto free_io_events;
	}
	/* one socketpair + one client thread per client slot */
	for (i = 0; i < nr_clients; ++i) {
		struct epoll_event event = {
			.events = EPOLLIN|EPOLLET,
			.data.ptr = &svs[2*i],
		};
		if (socketpair(AF_UNIX, SOCK_STREAM, 0, &svs[2*i])) {
			die();
		}
		if (epoll_add(epfd, svs[2*i], &event))
			die();
		if (pthread_create(&clients[i], NULL, client, &svs[2*i]))
			abort();
	}
	while (1) {
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
		j = epoll_wait(epfd, events, nr_clients, 0);
		k = io_getevents(context, 1, nr_clients, io_events, &ts);
		for (i = 0; i < k; ++i) {
			struct iocb *cb =
				(struct iocb *)(unsigned long)io_events[i].obj;
			/*
			 * aio_data carried the address of the buffer this
			 * iocb used; recover it from the completion event
			 * rather than re-deriving an index from cb - iocbs,
			 * which over-scaled and indexed bufs[] out of bounds.
			 * On read completion, echo the payload into the
			 * adjacent write slot.
			 */
			pthread_t *buf =
				(pthread_t *)(unsigned long)io_events[i].data;
			cb->aio_data = 0;
			if (cb->aio_lio_opcode == IOCB_CMD_PREAD && buf)
				buf[1] = buf[0];
		}
		for (i = 0; i < j; ++i) {
			int m, n = ((int *)events[i].data.ptr - svs)/2;

			/*
			 * NOTE(review): buffers are keyed by the epoll slot
			 * i while iocbs are keyed by the client index n, so
			 * buffer slots are shared across iterations of the
			 * outer loop.  Harmless for this pounding test, but
			 * worth knowing before reusing this pattern.
			 */
			m = (n + 1) % nr_clients;
			bufs[2*i+1] = clients[m];
			iocbs[2*n].aio_lio_opcode = IOCB_CMD_PREAD;
			iocbs[2*n+1].aio_lio_opcode = IOCB_CMD_PWRITE;
			iocbs[2*n].aio_fildes = svs[2*n];
			iocbs[2*n+1].aio_fildes = svs[2*n];
			iocbs[2*n].aio_buf = (unsigned long)&bufs[2*i];
			iocbs[2*n+1].aio_buf = (unsigned long)&bufs[2*i+1];
			iocbs[2*n].aio_data = (unsigned long)&bufs[2*i];
			iocbs[2*n+1].aio_data = (unsigned long)&bufs[2*i+1];
			iocbs[2*n].aio_nbytes = sizeof(pthread_t);
			iocbs[2*n+1].aio_nbytes = sizeof(pthread_t);
			iocbs[2*n].aio_offset = 0;
			iocbs[2*n+1].aio_offset = 0;
			iocbps[2*i] = &iocbs[2*n];
			iocbps[2*i+1] = &iocbs[2*n+1];
		}
		if (j)
			io_submit(context, 2*j, iocbps);
	}
free_io_events:
	free(io_events);
free_iocbps:
	free(iocbps);
free_iocbs:
	free(iocbs);
free_events:
	free(events);
free_svs:
	free(svs);
free_clients:
	free(clients);
free_bufs:
	free(bufs);
	die();
}