2002-11-06 00:59:52

by Badari Pulavarty

Subject: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT

Hi,

This is part 2/2 of the 2.5.46 patch to support AIO for raw/O_DIRECT.

This patch adds AIO support to the DIO code path. It also includes
a workaround for the problem of calling set_page_dirty() from
interrupt context.
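
In short, the workaround defers the real work (this is just a condensed
sketch of the hunks below, not extra code): the interrupt-time completion
handler cannot call set_page_dirty(), because that takes locks which are
not irq-safe, so it only sets page flags and lets shrink_list() redo the
dirtying later from process context:

	/* BIO completion handler, interrupt context (dio_bio_end_aio): */
	if (dio->rw == READ) {
		SetPageDirty(page);		/* flag only, no locking */
		SetPageWrongList(page);		/* fix up lists later */
	}

	/* shrink_list(), process context (the mm/vmscan.c hunk): */
	if (PageWrongList(page)) {
		if (TestClearPageDirty(page))
			set_page_dirty(page);	/* safe to take locks here */
		ClearPageWrongList(page);
	}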

Andrew, could you please check whether I did the "set_page_dirty()"
hack you suggested correctly (and in the right place)?

Ben, could you pick up these patches and push them to Linus?

NOTE: You need part 1/2 to use this patch.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.46/fs/direct-io.c linux-2.5.46.aio/fs/direct-io.c
--- linux-2.5.46/fs/direct-io.c Tue Nov 5 16:01:18 2002
+++ linux-2.5.46.aio/fs/direct-io.c Tue Nov 5 14:57:36 2002
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
@@ -100,6 +101,11 @@
spinlock_t bio_list_lock; /* protects bio_list */
struct bio *bio_list; /* singly linked via bi_private */
struct task_struct *waiter; /* waiting task (NULL if none) */
+
+ /* AIO related stuff */
+ struct kiocb *iocb; /* kiocb */
+ int is_async; /* is IO async? */
+ int result; /* IO result */
};

/*
@@ -176,6 +182,43 @@
return dio->pages[dio->head++];
}

+static void dio_bio_count(struct dio *dio)
+{
+ if (atomic_dec_and_test(&dio->bio_count)) {
+ if (dio->is_async) {
+ aio_complete(dio->iocb, dio->result, 0);
+ kfree(dio);
+ }
+ }
+}
+
+static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+{
+ struct dio *dio = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+
+ if (bio->bi_size)
+ return 1;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (dio->rw == READ) {
+ SetPageDirty(page);
+ SetPageWrongList(page);
+ }
+ page_cache_release(page);
+ }
+ if (!uptodate)
+ dio->result = -EIO;
+
+ dio_bio_count(dio);
+ bio_put(bio);
+ return 0;
+}
+
/*
* The BIO completion handler simply queues the BIO up for the process-context
* handler.
@@ -212,7 +255,10 @@

bio->bi_bdev = bdev;
bio->bi_sector = first_sector;
- bio->bi_end_io = dio_bio_end_io;
+ if (dio->is_async)
+ bio->bi_end_io = dio_bio_end_aio;
+ else
+ bio->bi_end_io = dio_bio_end_io;

dio->bio = bio;
return 0;
@@ -745,73 +791,84 @@
}

static int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, unsigned blkbits,
- get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ unsigned blkbits, get_blocks_t get_blocks)
{
unsigned long user_addr;
int seg, ret2, ret = 0;
- struct dio dio;
- size_t bytes, tot_bytes = 0;
+ struct dio local_dio, *dio;
+ size_t bytes;

- dio.bio = NULL;
- dio.inode = inode;
- dio.rw = rw;
- dio.blkbits = blkbits;
- dio.blkfactor = inode->i_blkbits - blkbits;
- dio.start_zero_done = 0;
- dio.block_in_file = offset >> blkbits;
- dio.blocks_available = 0;
-
- dio.cur_page = NULL;
-
- dio.boundary = 0;
- dio.reap_counter = 0;
- dio.get_blocks = get_blocks;
- dio.final_block_in_bio = -1;
- dio.next_block_for_io = -1;
+ if (is_sync_kiocb(iocb)) {
+ dio = &local_dio;
+ dio->is_async = 0;
+ } else {
+ dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+ dio->is_async = 1;
+ }
+ dio->bio = NULL;
+ dio->inode = inode;
+ dio->rw = rw;
+ dio->blkbits = blkbits;
+ dio->blkfactor = inode->i_blkbits - blkbits;
+ dio->start_zero_done = 0;
+ dio->block_in_file = offset >> blkbits;
+ dio->blocks_available = 0;

- dio.page_errors = 0;
+ dio->cur_page = NULL;
+
+ dio->boundary = 0;
+ dio->reap_counter = 0;
+ dio->get_blocks = get_blocks;
+ dio->final_block_in_bio = -1;
+ dio->next_block_for_io = -1;
+
+ dio->page_errors = 0;
+ dio->result = 0;
+ dio->iocb = iocb;

/* BIO completion state */
- atomic_set(&dio.bio_count, 0);
- spin_lock_init(&dio.bio_list_lock);
- dio.bio_list = NULL;
- dio.waiter = NULL;
- dio.pages_in_io = 0;
+ atomic_set(&dio->bio_count, 1);
+ spin_lock_init(&dio->bio_list_lock);
+ dio->bio_list = NULL;
+ dio->waiter = NULL;
+ dio->pages_in_io = 0;

for (seg = 0; seg < nr_segs; seg++)
- dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2;
+ dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2;

for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
bytes = iov[seg].iov_len;

/* Index into the first page of the first block */
- dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
- dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+ dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+ dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
/* Page fetching state */
- dio.head = 0;
- dio.tail = 0;
- dio.curr_page = 0;
+ dio->head = 0;
+ dio->tail = 0;
+ dio->curr_page = 0;

- dio.total_pages = 0;
+ dio->total_pages = 0;
if (user_addr & (PAGE_SIZE-1)) {
- dio.total_pages++;
+ dio->total_pages++;
bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
}
- dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- dio.curr_user_address = user_addr;
+ dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio->curr_user_address = user_addr;

- ret = do_direct_IO(&dio);
+ ret = do_direct_IO(dio);

if (ret) {
- dio_cleanup(&dio);
+ dio_cleanup(dio);
break;
}

- tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
- dio.block_in_file) << blkbits);
+ dio->result += iov[seg].iov_len - ((dio->final_block_in_request -
+ dio->block_in_file) << blkbits);

} /* end iovec loop */

@@ -819,22 +876,32 @@
* There may be some unwritten disk at the end of a part-written
* fs-block-sized block. Go zero that now.
*/
- dio_zero_block(&dio, 1);
+ dio_zero_block(dio, 1);

- if (dio.cur_page) {
- ret2 = dio_send_cur_page(&dio);
- page_cache_release(dio.cur_page);
+ if (dio->cur_page) {
+ ret2 = dio_send_cur_page(dio);
+ page_cache_release(dio->cur_page);
if (ret == 0)
ret = ret2;
}
- ret2 = dio_await_completion(&dio);
+
+ if (dio->is_async) {
+ dio_bio_count(dio);
+ if (ret == 0)
+ ret = -EIOCBQUEUED;
+ goto out;
+ }
+
+ dio_bio_count(dio);
+ ret2 = dio_await_completion(dio);
if (ret == 0)
ret = ret2;
if (ret == 0)
- ret = dio.page_errors;
+ ret = dio->page_errors;
if (ret == 0)
- ret = tot_bytes;
+ ret = dio->result;

+out:
return ret;
}

@@ -878,7 +945,7 @@
}
}

- retval = direct_io_worker(rw, inode, iov, offset,
+ retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_blocks);
out:
return retval;
diff -Naur -X dontdiff linux-2.5.46/include/linux/page-flags.h linux-2.5.46.aio/include/linux/page-flags.h
--- linux-2.5.46/include/linux/page-flags.h Mon Nov 4 14:30:37 2002
+++ linux-2.5.46.aio/include/linux/page-flags.h Tue Nov 5 14:56:44 2002
@@ -70,6 +70,7 @@
#define PG_chainlock 15 /* lock bit for ->pte_chain */

#define PG_direct 16 /* ->pte_chain points directly at pte */
+#define PG_wronglist 17 /* page is on wrong list */

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -233,6 +234,10 @@
#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags)
#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags)

+#define SetPageWrongList(page) set_bit(PG_wronglist, &(page)->flags)
+#define PageWrongList(page) test_bit(PG_wronglist, &(page)->flags)
+#define ClearPageWrongList(page) clear_bit(PG_wronglist, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
diff -Naur -X dontdiff linux-2.5.46/mm/vmscan.c linux-2.5.46.aio/mm/vmscan.c
--- linux-2.5.46/mm/vmscan.c Mon Nov 4 14:30:07 2002
+++ linux-2.5.46.aio/mm/vmscan.c Tue Nov 5 14:58:56 2002
@@ -378,6 +378,12 @@
goto keep_locked;
}

+ if (PageWrongList(page)) {
+ if (TestClearPageDirty(page))
+ set_page_dirty(page);
+ ClearPageWrongList(page);
+ }
+
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page->index };
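
(One subtlety in the above worth spelling out: bio_count now starts at
1 rather than 0. That extra count is the submitter's reference, dropped
via dio_bio_count() only after all the BIOs have been issued:

	atomic_set(&dio->bio_count, 1);	/* submitter holds a reference */
	...
	dio_bio_count(dio);		/* dropped once submission is done */

so a fast completion from interrupt context cannot call aio_complete()
and free the dio while direct_io_worker() is still using it.)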


2002-11-06 01:11:05

by Andrew Morton

Subject: Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT

Badari Pulavarty wrote:
>
> Hi,
>
> This is part 2/2 of the 2.5.46 patch to support AIO for raw/O_DIRECT.
>
> This patch adds AIO support to the DIO code path. It also includes
> a workaround for the problem of calling set_page_dirty() from
> interrupt context.
>
> Andrew, could you please check whether I did the "set_page_dirty()"
> hack you suggested correctly (and in the right place)?
>

Looks like it. It's such a hack, I want to hide ;)

Sigh. I think I'd prefer to just go and make ->page_lock
and ->private_lock irq-safe.

Or not proceed with this patch at all. If this is to be the
only code which wishes to perform page list motion at interrupt
time, perhaps it's not justifiable?

I really don't have a feeling for how valuable this is, nor
do I know whether there will be other code which wants to
perform page list manipulation at interrupt time.

In fact I also don't know where the whole AIO thing sits at
present. Is it all done and finished? Is there more to come,
and if so, what??

2002-11-06 06:58:29

by Benjamin LaHaise

Subject: Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT

On Tue, Nov 05, 2002 at 05:17:32PM -0800, Andrew Morton wrote:
> Or not proceed with this patch at all. If this is to be the
> only code which wishes to perform page list motion at interrupt
> time, perhaps it's not justifiable?
>
> I really don't have a feeling for how valuable this is, nor
> do I know whether there will be other code which wants to
> perform page list manipulation at interrupt time.

I can think of a few other places that would like to perform page
motion from irq context: anything else doing zero copy or page
flipping, and more importantly the O(1) vm code that's being worked
on. The latter is actually quite important: we've got a number of
customers running into problems with some of the algorithms in the
2.4 kernel, where the kernel does not perform any list motion from
irq context, and the result is excess cpu time spent traversing
lists to see if io has completed.

> In fact I also don't know where the whole AIO thing sits at
> present. Is it all done and finished? Is there more to come,
> and if so, what??

There's more to come. The bits I'm working on run in kernel context,
mainly to simplify the copy_*_user case: we don't have full zero-copy
semantics available, coping with pinned pages is a challenge in a
multiuser system, and it makes reusing the existing networking code
a lot easier. Basically, anything that involves a copy of data is
likely to be better implemented running in a task, to get the
priority of execution correct, whereas anything involving zero-copy
io is going to want completion from irq or bottom-half context, and
hence to dirty pages from there. Does that make sense?
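
To illustrate the difference (a rough sketch only, not code from any
real patch, and the variable names are made up): both paths finish with
the same aio_complete() call; what differs is the calling context and
whether pages need dirtying first.

	/* copy-based path: task context, so sleeping and faulting are OK */
	if (copy_to_user(user_buf, kernel_buf, len))
		ret = -EFAULT;
	aio_complete(iocb, ret, 0);

	/* zero-copy path: irq/bottom-half context, as in the DIO patch
	 * above; the pages were filled by DMA, so they must be dirtied
	 * (there, deferred via PG_wronglist) before completion. */
	SetPageDirty(page);
	SetPageWrongList(page);
	aio_complete(iocb, dio->result, 0);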

-ben
--
"Do you seek knowledge in time travel?"

2002-11-06 07:17:08

by Andrew Morton

Subject: Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT

Benjamin LaHaise wrote:
>
> On Tue, Nov 05, 2002 at 05:17:32PM -0800, Andrew Morton wrote:
> > Or not proceed with this patch at all. If this is to be the
> > only code which wishes to perform page list motion at interrupt
> > time, perhaps it's not justifiable?
> >
> > I really don't have a feeling for how valuable this is, nor
> > do I know whether there will be other code which wants to
> > perform page list manipulation at interrupt time.
>
> I can think of a few other places that would like to perform page
> motion from irq context: anything else doing zero copy or page
> flipping, and more importantly the O(1) vm code that's being worked
> on. The latter is actually quite important: we've got a number of
> customers running into problems with some of the algorithms in the
> 2.4 kernel, where the kernel does not perform any list motion from
> irq context, and the result is excess cpu time spent traversing
> lists to see if io has completed.

Interesting. Would it be possible to get more details on this
problem?

We can shuffle pages around on the LRU from interrupts at present.
But not the address_space lists and buffer stuff.

We could remove the address_space lists and page->list altogether
with a bit of radix tree work. I started in on that but got distracted.

mapping->clean_pages is actually obsolete now - we never walk it. It
could be replaced with list_del_init() everywhere.
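
I.e. (sketching the conversion), a place which currently does
something like

	list_add(&page->list, &mapping->clean_pages);

would just become

	list_del_init(&page->list);

since nothing walks the clean list any more.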

> > In fact I also don't know where the whole AIO thing sits at
> > present. Is it all done and finished? Is there more to come,
> > and if so, what??
>
> There's more to come. The bits I'm working on run in kernel context,
> mainly to simplify the copy_*_user case: we don't have full zero-copy
> semantics available, coping with pinned pages is a challenge in a
> multiuser system, and it makes reusing the existing networking code
> a lot easier. Basically, anything that involves a copy of data is
> likely to be better implemented running in a task, to get the
> priority of execution correct, whereas anything involving zero-copy
> io is going to want completion from irq or bottom-half context, and
> hence to dirty pages from there. Does that make sense?

OK, thanks. So I guess we make those locks irq-safe. I don't
expect that would be feasible in 2.4 because of the lock hold times
in truncate, but it should be OK in 2.5. I'll do the patch.
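
(For concreteness, "irq-safe" just means the usual conversion, sketched
here for ->private_lock:

	spin_lock(&mapping->private_lock);
	...
	spin_unlock(&mapping->private_lock);

becomes

	unsigned long flags;

	spin_lock_irqsave(&mapping->private_lock, flags);
	...
	spin_unlock_irqrestore(&mapping->private_lock, flags);

so the lock may also be taken from interrupt context without the risk
of deadlocking against an interrupted holder.)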