2007-08-28 19:15:57

by Christoph Lameter

Subject: [11/36] Use page_cache_xxx in fs/buffer.c

Use page_cache_xxx in fs/buffer.c.

We have a special situation in set_bh_page() since reiserfs calls that
function before setting up the mapping. So retrieve the page size
from the page struct rather than the mapping.
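
For reference, this patch relies on the page_cache_xxx helpers introduced
earlier in the series. A minimal sketch of their assumed shape, inferred
from how they are used below (the field holding the per-mapping page
order, here called 'order', is an assumption for illustration):

/*
 * Sketch of the page_cache_xxx helpers used below. Definitions are
 * assumed from their use; 'order' on struct address_space is
 * hypothetical here.
 */
static inline int page_cache_shift(struct address_space *a)
{
	return PAGE_SHIFT + a->order;
}

static inline unsigned int page_cache_size(struct address_space *a)
{
	return 1u << page_cache_shift(a);
}

static inline pgoff_t page_cache_index(struct address_space *a, loff_t pos)
{
	return pos >> page_cache_shift(a);
}

/* byte offset within the (possibly compound) page cache page */
static inline unsigned int page_cache_offset(struct address_space *a,
					     loff_t pos)
{
	return pos & (page_cache_size(a) - 1);
}

static inline loff_t page_cache_pos(struct address_space *a,
				    pgoff_t index, unsigned long offset)
{
	return ((loff_t)index << page_cache_shift(a)) + offset;
}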

Signed-off-by: Christoph Lameter <[email protected]>
---
fs/buffer.c | 110 +++++++++++++++++++++++++++++++++---------------------------
1 file changed, 62 insertions(+), 48 deletions(-)

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2007-08-28 11:37:13.000000000 -0700
+++ linux-2.6/fs/buffer.c 2007-08-28 11:37:58.000000000 -0700
@@ -257,7 +257,7 @@ __find_get_block_slow(struct block_devic
struct page *page;
int all_mapped = 1;

- index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+ index = block >> (page_cache_shift(bd_mapping) - bd_inode->i_blkbits);
page = find_get_page(bd_mapping, index);
if (!page)
goto out;
@@ -697,7 +697,7 @@ static int __set_page_dirty(struct page

if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
- task_io_account_write(PAGE_CACHE_SIZE);
+ task_io_account_write(page_cache_size(mapping));
}
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
@@ -891,10 +891,11 @@ struct buffer_head *alloc_page_buffers(s
{
struct buffer_head *bh, *head;
long offset;
+ unsigned int page_size = page_cache_size(page->mapping);

try_again:
head = NULL;
- offset = PAGE_SIZE;
+ offset = page_size;
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(GFP_NOFS);
if (!bh)
@@ -1426,7 +1427,7 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
bh->b_page = page;
- BUG_ON(offset >= PAGE_SIZE);
+ BUG_ON(offset >= compound_size(page));
if (PageHighMem(page))
/*
* This catches illegal uses and preserves the offset:
@@ -1605,6 +1606,7 @@ static int __block_write_full_page(struc
struct buffer_head *bh, *head;
const unsigned blocksize = 1 << inode->i_blkbits;
int nr_underway = 0;
+ struct address_space *mapping = inode->i_mapping;

BUG_ON(!PageLocked(page));

@@ -1625,7 +1627,8 @@ static int __block_write_full_page(struc
* handle that here by just cleaning them.
*/

- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ block = (sector_t)page->index <<
+ (page_cache_shift(mapping) - inode->i_blkbits);
head = page_buffers(page);
bh = head;

@@ -1742,7 +1745,7 @@ recover:
} while ((bh = bh->b_this_page) != head);
SetPageError(page);
BUG_ON(PageWriteback(page));
- mapping_set_error(page->mapping, err);
+ mapping_set_error(mapping, err);
set_page_writeback(page);
do {
struct buffer_head *next = bh->b_this_page;
@@ -1767,8 +1770,8 @@ static int __block_prepare_write(struct
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > page_cache_size(inode->i_mapping));
+ BUG_ON(to > page_cache_size(inode->i_mapping));
BUG_ON(from > to);

blocksize = 1 << inode->i_blkbits;
@@ -1777,7 +1780,8 @@ static int __block_prepare_write(struct
head = page_buffers(page);

bbits = inode->i_blkbits;
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index <<
+ (page_cache_shift(inode->i_mapping) - bbits);

for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
@@ -1921,7 +1925,8 @@ int block_read_full_page(struct page *pa
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);

- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = (sector_t)page->index <<
+ (page_cache_shift(page->mapping) - inode->i_blkbits);
lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
@@ -2045,7 +2050,7 @@ int generic_cont_expand(struct inode *in
pgoff_t index;
unsigned int offset;

- offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
+ offset = page_cache_offset(inode->i_mapping, size); /* Within page */

/* ugh. in prepare/commit_write, if from==to==start of block, we
** skip the prepare. make sure we never send an offset for the start
@@ -2055,7 +2060,7 @@ int generic_cont_expand(struct inode *in
/* caller must handle this extra byte. */
offset++;
}
- index = size >> PAGE_CACHE_SHIFT;
+ index = page_cache_index(inode->i_mapping, size);

return __generic_cont_expand(inode, size, index, offset);
}
@@ -2063,8 +2068,8 @@ int generic_cont_expand(struct inode *in
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
loff_t pos = size - 1;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
+ pgoff_t index = page_cache_index(inode->i_mapping, pos);
+ unsigned int offset = page_cache_offset(inode->i_mapping, pos) + 1;

/* prepare/commit_write can handle even if from==to==start of block. */
return __generic_cont_expand(inode, size, index, offset);
@@ -2086,28 +2091,28 @@ int cont_prepare_write(struct page *page
unsigned zerofrom;
unsigned blocksize = 1 << inode->i_blkbits;

- while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
+ while (page->index > (pgpos = page_cache_index(mapping, *bytes))) {
status = -ENOMEM;
new_page = grab_cache_page(mapping, pgpos);
if (!new_page)
goto out;
/* we might sleep */
- if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
+ if (page_cache_index(mapping, *bytes) != pgpos) {
unlock_page(new_page);
page_cache_release(new_page);
continue;
}
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = page_cache_offset(mapping, *bytes);
if (zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
status = __block_prepare_write(inode, new_page, zerofrom,
- PAGE_CACHE_SIZE, get_block);
+ page_cache_size(mapping), get_block);
if (status)
goto out_unmap;
- zero_user_segment(new_page, zerofrom, PAGE_CACHE_SIZE);
- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
+ zero_user_segment(new_page, zerofrom, page_cache_size(mapping));
+ generic_commit_write(NULL, new_page, zerofrom, page_cache_size(mapping));
unlock_page(new_page);
page_cache_release(new_page);
}
@@ -2117,7 +2122,7 @@ int cont_prepare_write(struct page *page
zerofrom = offset;
} else {
/* page covers the boundary, find the boundary offset */
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = page_cache_offset(mapping, *bytes);

/* if we will expand the thing last block will be filled */
if (to > zerofrom && (zerofrom & (blocksize-1))) {
@@ -2169,8 +2174,9 @@ int block_commit_write(struct page *page
int generic_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = page_cache_pos(mapping, page->index, to);
__block_commit_write(inode,page,from,to);
/*
* No need to use i_size_read() here, the i_size
@@ -2206,20 +2212,22 @@ block_page_mkwrite(struct vm_area_struct
unsigned long end;
loff_t size;
int ret = -EINVAL;
+ struct address_space *mapping;

lock_page(page);
+ mapping = page->mapping;
size = i_size_read(inode);
- if ((page->mapping != inode->i_mapping) ||
+ if ((mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
/* page got truncated out from underneath us */
goto out_unlock;
}

/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
- end = size & ~PAGE_CACHE_MASK;
+ if (page_cache_pos(mapping, page->index + 1, 0) > size)
+ end = page_cache_offset(mapping, size);
else
- end = PAGE_CACHE_SIZE;
+ end = page_cache_size(mapping);

ret = block_prepare_write(page, 0, end, get_block);
if (!ret)
@@ -2258,6 +2266,7 @@ static void end_buffer_read_nobh(struct
int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
get_block_t *get_block)
{
+ struct address_space *mapping = page->mapping;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocksize = 1 << blkbits;
@@ -2265,6 +2274,7 @@ int nobh_prepare_write(struct page *page
struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
unsigned block_in_page;
unsigned block_start;
+ unsigned page_size = page_cache_size(mapping);
sector_t block_in_file;
int nr_reads = 0;
int i;
@@ -2274,7 +2284,8 @@ int nobh_prepare_write(struct page *page
if (PageMappedToDisk(page))
return 0;

- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index <<
+ (page_cache_shift(mapping) - blkbits);
map_bh.b_page = page;

/*
@@ -2283,7 +2294,7 @@ int nobh_prepare_write(struct page *page
* page is fully mapped-to-disk.
*/
for (block_start = 0, block_in_page = 0;
- block_start < PAGE_CACHE_SIZE;
+ block_start < page_size;
block_in_page++, block_start += blocksize) {
unsigned block_end = block_start + blocksize;
int create;
@@ -2372,7 +2383,7 @@ failed:
* Error recovery is pretty slack. Clear the page and mark it dirty
* so we'll later zero out any blocks which _were_ allocated.
*/
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, page_size);
SetPageUptodate(page);
set_page_dirty(page);
return ret;
@@ -2386,8 +2397,9 @@ EXPORT_SYMBOL(nobh_prepare_write);
int nobh_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = page_cache_pos(mapping, page->index, to);

SetPageUptodate(page);
set_page_dirty(page);
@@ -2407,9 +2419,10 @@ EXPORT_SYMBOL(nobh_commit_write);
int nobh_writepage(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
- struct inode * const inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode * const inode = mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = page_cache_index(mapping, i_size);
unsigned offset;
int ret;

@@ -2418,7 +2431,7 @@ int nobh_writepage(struct page *page, ge
goto out;

/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = page_cache_offset(mapping, i_size);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
@@ -2441,7 +2454,7 @@ int nobh_writepage(struct page *page, ge
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, page_cache_size(mapping));
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
@@ -2457,8 +2470,8 @@ int nobh_truncate_page(struct address_sp
{
struct inode *inode = mapping->host;
unsigned blocksize = 1 << inode->i_blkbits;
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = page_cache_index(mapping, from);
+ unsigned offset = page_cache_offset(mapping, from);
unsigned to;
struct page *page;
const struct address_space_operations *a_ops = mapping->a_ops;
@@ -2475,7 +2488,7 @@ int nobh_truncate_page(struct address_sp
to = (offset + blocksize) & ~(blocksize - 1);
ret = a_ops->prepare_write(NULL, page, offset, to);
if (ret == 0) {
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, page_cache_size(mapping));
/*
* It would be more correct to call aops->commit_write()
* here, but this is more efficient.
@@ -2493,8 +2506,8 @@ EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = page_cache_index(mapping, from);
+ unsigned offset = page_cache_offset(mapping, from);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
@@ -2511,8 +2524,8 @@ int block_truncate_page(struct address_s
return 0;

length = blocksize - length;
- iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
+ iblock = (sector_t)index <<
+ (page_cache_shift(mapping) - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
if (!page)
@@ -2571,9 +2584,10 @@ out:
int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
- struct inode * const inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode * const inode = mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = page_cache_index(mapping, i_size);
unsigned offset;

/* Is the page fully inside i_size? */
@@ -2581,7 +2595,7 @@ int block_write_full_page(struct page *p
return __block_write_full_page(inode, page, get_block, wbc);

/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = page_cache_offset(mapping, i_size);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
@@ -2600,7 +2614,7 @@ int block_write_full_page(struct page *p
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, page_cache_size(mapping));
return __block_write_full_page(inode, page, get_block, wbc);
}

@@ -2854,7 +2868,7 @@ int try_to_free_buffers(struct page *pag
* dirty bit from being lost.
*/
if (ret)
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ cancel_dirty_page(page, page_cache_size(mapping));
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {

--


2007-08-30 09:22:47

by Dmitri Monakhov

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On 12:06 Tue 28 Aug , [email protected] wrote:
> Use page_cache_xxx in fs/buffer.c.
submit_bh() wasn't changed, which means that bio pages may have a huge
size without respecting the queue restrictions (q->max_hw_segments, etc.).
At least drivers/md/raid0 will be broken by your patch.

2007-08-30 18:14:52

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Thu, 30 Aug 2007, Dmitry Monakhov wrote:

> On 12:06 Tue 28 Aug , [email protected] wrote:
> > Use page_cache_xxx in fs/buffer.c.
> submit_bh() wasn't changed, which means that bio pages may have a huge
> size without respecting the queue restrictions (q->max_hw_segments, etc.).
> At least drivers/md/raid0 will be broken by your patch.

Hmmm... So we need to check the page size and generate multiple requests
in submit_bh?

2007-08-31 01:48:03

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

This may already be handled?

submit_bh() calls submit_bio() which calls __generic_make_request() and
there we do:

	if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
		printk("bio too big device %s (%u > %u)\n",
			bdevname(bio->bi_bdev, b),
			bio_sectors(bio),
			q->max_hw_sectors);
		goto end_io;
	}

So if we try to push a too large buffer down with submit_bh() we get a
failure.

2007-08-31 06:57:09

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Thu, Aug 30 2007, Christoph Lameter wrote:
> This may already be handled?
>
> submit_bh() calls submit_bio() which calls __generic_make_request() and
> there we do:
>
> 	if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
> 		printk("bio too big device %s (%u > %u)\n",
> 			bdevname(bio->bi_bdev, b),
> 			bio_sectors(bio),
> 			q->max_hw_sectors);
> 		goto end_io;
> 	}
>
> So if we try to push a too large buffer down with submit_bh() we get a
> failure.

Only partly, you may be violating a number of other restrictions (size
is many things, not just length of the data).

--
Jens Axboe

2007-08-31 07:03:53

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Jens Axboe wrote:

> > So if we try to push a too large buffer down with submit_bh() we get a
> > failure.
>
> Only partly, you may be violating a number of other restrictions (size
> is many things, not just length of the data).

Could you be more specific?

2007-08-31 07:11:57

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, Aug 31 2007, Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > > So if we try to push a too large buffer down with submit_bh() we get a
> > > failure.
> >
> > Only partly, you may be violating a number of other restrictions (size
> > is many things, not just length of the data).
>
> Could you be more specific?

Size of a single segment, for instance. Or if the bio crosses a dma
boundary. If your block is 64kb and the maximum segment size is 32kb,
then you would need to clone the bio and split it into two.

Things like that. This isn't a problem with single page requests, as we
based the lower possible boundaries on that.
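
To make that concrete, a minimal sketch of the boundary test involved,
modeled on the kernel's BIOVEC_SEG_BOUNDARY() logic ('mask' standing in
for q->seg_boundary_mask; the helper name is made up for illustration):

/*
 * Two addresses belong to the same segment only if no dma boundary
 * lies between them: start and end must agree in the bits above the
 * mask. Returns nonzero if [addr, addr + len) crosses a boundary.
 */
static inline int crosses_seg_boundary(unsigned long addr,
				       unsigned long len,
				       unsigned long mask)
{
	return (addr | mask) != ((addr + len - 1) | mask);
}

E.g. a 64k buffer with mask 0x7fff (32k boundaries) always crosses, so
it needs at least two segments.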

--
Jens Axboe

2007-08-31 07:17:50

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Jens Axboe wrote:

> > Could you be more specific?
>
> Size of a single segment, for instance. Or if the bio crosses a dma
> boundary. If your block is 64kb and the maximum segment size is 32kb,
> then you would need to clone the bio and split it into two.

A DMA boundary cannot be crossed AFAIK. The compound pages are aligned to
the power of two boundaries and the page allocator will not create pages
that cross the zone boundaries.

It looks like the code will correctly signal a failure if you try to write
a 64k block on a device with a maximum segment size of 32k. Isn't this
okay? One would not want to use a larger block size than supported by the
underlying hardware?

> Things like that. This isn't a problem with single page requests, as we
> based the lower possible boundaries on that.

submit_bh() is used to submit a single buffer and I think that was our
main concern here.

2007-08-31 07:26:25

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, Aug 31 2007, Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > > Could you be more specific?
> >
> > Size of a single segment, for instance. Or if the bio crosses a dma
> > boundary. If your block is 64kb and the maximum segment size is 32kb,
> > then you would need to clone the bio and split it into two.
>
> A DMA boundary cannot be crossed AFAIK. The compound pages are aligned to
> the power of two boundaries and the page allocator will not create pages
> that cross the zone boundaries.

With a 64k page and a dma boundary of 0x7fff, that's two segments.

> It looks like the code will correctly signal a failure if you try to write
> a 64k block on a device with a maximum segment size of 32k. Isn't this
> okay? One would not want to use a larger block size than supported by the
> underlying hardware?

That's just the size in sectors limitation again. And that also needs to
be handled; the fact that it currently errors out is reassuring, but
definitely not a long term solution. You don't want to knowingly set up
such a system where the fs block size is larger than what the hardware
would want, but it should work. You could be moving hardware around, for
recovery or otherwise.

> > Things like that. This isn't a problem with single page requests, as we
> > based the lower possible boundaries on that.
>
> submit_bh() is used to submit a single buffer and I think that was our
> main concern here.

And how large can that be?

--
Jens Axboe

2007-08-31 07:33:44

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Jens Axboe wrote:

> > A DMA boundary cannot be crossed AFAIK. The compound pages are aligned to
> > the power of two boundaries and the page allocator will not create pages
> > that cross the zone boundaries.
>
> With a 64k page and a dma boundary of 0x7fff, that's two segments.

Ok, so DMA memory restrictions not conforming to the DMA zones? The
example is a bit weird. DMA only to the first 32k of memory? If the limit
were higher, like 16MB, then we would not have an issue. Is there really
a device that can only do I/O to the first 32k of memory?

How do we split that up today? We could add processing to submit_bio to
check for the boundary and create two bios.

> > submit_bh() is used to submit a single buffer and I think that was our
> > main concern here.
>
> And how large can that be?

As large as mkxxxfs allowed it to be. For XFS and extX with the current
patchset 32k is the limit (64k with the fixes to ext2) but a new
filesystem could theoretically use a larger blocksize.


2007-08-31 07:43:52

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, Aug 31 2007, Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > > A DMA boundary cannot be crossed AFAIK. The compound pages are aligned to
> > > the power of two boundaries and the page allocator will not create pages
> > > that cross the zone boundaries.
> >
> > With a 64k page and a dma boundary of 0x7fff, that's two segments.
>
> Ok, so DMA memory restrictions not conforming to the DMA zones? The
> example is a bit weird. DMA only to the first 32k of memory? If the limit
> were higher, like 16MB, then we would not have an issue. Is there really
> a device that can only do I/O to the first 32k of memory?

They have nothing to do with each other, you are mixing things up. It
has nothing to do with the device being able to dma into that memory or
not, we have fine existing infrastructure to handle that. But different
hardware have different characteristics on what a single segment is. You
can say "a single segment cannot cross a 32kb boundary". So from the
example above, your single 64k page may need to be split into two
segments. Or it could have a maximum segment size of 32k, in which case
it would have to be split as well.

Do you see what I mean now?

> How do we split that up today? We could add processing to submit_bio
> to check for the boundary and create two bios.

But we do not split them up today - see what I wrote! Today we impose
the restriction that a device must be able to handle a single "normal"
page, and if it can't do that, it has to split it up itself.

But yes, you would have to create some out-of-line function to use
bio_split() until you have chopped things down enough. It's not a good
thing for performance naturally, but if we consider this a "just make it
work" fallback, I don't think it's too bad. You want to make a note of
that it is happening though, so people realize that it is happening.
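
For context, the one-level split referred to here is roughly what
drivers/md/raid0.c does for requests straddling a chunk boundary (sketch
against the 2.6.23-era API; bio_split_pool is the global pool from
fs/bio.c, and first_sectors is the size of the first half):

static void split_and_submit(struct bio *bio, int first_sectors)
{
	/* split into two bios at first_sectors and submit both halves */
	struct bio_pair *bp = bio_split(bio, bio_split_pool,
					first_sectors);

	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}

Chopping a large buffer "down enough" would mean repeating this until
every piece fits the queue limits.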

> > > submit_bh() is used to submit a single buffer and I think that was
> > > our main concern here.
> >
> > And how large can that be?
>
> As large as mkxxxfs allowed it to be. For XFS and extX with the
> current patchset 32k is the limit (64k with the fixes to ext2) but a
> new filesystem could theoretically use a larger blocksize.

OK, since it goes direct to bio anyway, it can be handled there.

--
Jens Axboe

2007-08-31 07:52:34

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Jens Axboe wrote:

> They have nothing to do with each other, you are mixing things up. It
> has nothing to do with the device being able to dma into that memory or
> not, we have fine existing infrastructure to handle that. But different
> hardware have different characteristics on what a single segment is. You
> can say "a single segment cannot cross a 32kb boundary". So from the
> example above, your single 64k page may need to be split into two
> segments. Or it could have a maximum segment size of 32k, in which case
> it would have to be split as well.
>
> Do you see what I mean now?

Ok. So another solution may be to limit the blocksizes that can be used
with a device?

> > How do we split that up today? We could add processing to submit_bio
> > to check for the boundary and create two bios.
>
> But we do not split them up today - see what I wrote! Today we impose
> the restriction that a device must be able to handle a single "normal"
> page, and if it can't do that, it has to split it up itself.
>
> But yes, you would have to create some out-of-line function to use
> bio_split() until you have chopped things down enough. It's not a good
> thing for performance naturally, but if we consider this a "just make it
> work" fallback, I don't think it's too bad. You want to make a note of
> that it is happening though, so people realize that it is happening.

Hmmmm.. We could keep the existing scheme too and check that device
drivers split things up if they are too large? Isn't it possible today
to create a huge bio of 2M for huge pages and send it to a device?

2007-08-31 08:12:22

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, Aug 31 2007, Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > They have nothing to do with each other, you are mixing things up. It
> > has nothing to do with the device being able to dma into that memory or
> > not, we have fine existing infrastructure to handle that. But different
> > hardware have different characteristics on what a single segment is. You
> > can say "a single segment cannot cross a 32kb boundary". So from the
> > example above, your single 64k page may need to be split into two
> > segments. Or it could have a maximum segment size of 32k, in which case
> > it would have to be split as well.
> >
> > Do you see what I mean now?
>
> Ok. So another solution may be to limit the blocksizes that can be used
> with a device?

That'd work for creation, but not for moving things around.

> > > How do we split that up today? We could add processing to submit_bio
> > > to check for the boundary and create two bios.
> >
> > But we do not split them up today - see what I wrote! Today we impose
> > the restriction that a device must be able to handle a single "normal"
> > page, and if it can't do that, it has to split it up itself.
> >
> > But yes, you would have to create some out-of-line function to use
> > bio_split() until you have chopped things down enough. It's not a good
> > thing for performance naturally, but if we consider this a "just make it
> > work" fallback, I don't think it's too bad. You want to make a note of
> > that it is happening though, so people realize that it is happening.
>
> Hmmmm.. We could keep the existing scheme too and check that device
> drivers split things up if they are too large? Isn't it possible today
> to create a huge bio of 2M for huge pages and send it to a device?

Not sure, aren't the constituents of compound pages the basis for IO?

--
Jens Axboe

2007-08-31 08:38:26

by Dmitri Monakhov

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On 00:52 Fri 31 Aug , Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > They have nothing to do with each other, you are mixing things up. It
> > has nothing to do with the device being able to dma into that memory or
> > not, we have fine existing infrastructure to handle that. But different
> > hardware have different characteristics on what a single segment is. You
> > can say "a single segment cannot cross a 32kb boundary". So from the
> > example above, your single 64k page may need to be split into two
> > segments. Or it could have a maximum segment size of 32k, in which case
> > it would have to be split as well.
> >
> > Do you see what I mean now?
>
> Ok. So another solution may be to limit the blocksizes that can be used
> with a device?
IMHO it is not good, because after a fs was created with a big blksize its
image can't be used on other devices.
We may just rewrite submit_bh() similar to drivers/md/dm-io.c:do_region
with the following pseudocode:

remaining = super_page_size();
while (remaining) {
	init_bio(bio);
	/* Try and add as many pages as possible */
	while (remaining) {
		dp->get_page(dp, &page, &len, &offset);
		len = min(len, to_bytes(remaining));
		if (!bio_add_page(bio, page, len, offset))
			break;
		offset = 0;
		remaining -= to_sector(len);
		dp->next_page(dp);
	}
	atomic_inc(&io->count);
	submit_bio(rw, bio);
}
> > > How do we split that up today? We could add processing to submit_bio
> > > to check for the boundary and create two bios.
> >
> > But we do not split them up today - see what I wrote! Today we impose
> > the restriction that a device must be able to handle a single "normal"
> > page, and if it can't do that, it has to split it up itself.
> >
> > But yes, you would have to create some out-of-line function to use
> > bio_split() until you have chopped things down enough. It's not a good
> > thing for performance naturally, but if we consider this a "just make it
> > work" fallback, I don't think it's too bad. You want to make a note of
> > that it is happening though, so people realize that it is happening.
>
> Hmmmm.. We could keep the existing scheme too and check that device
> drivers split things up if they are too large? Isn't it possible today
> to create a huge bio of 2M for huge pages and send it to a device?

2007-08-31 15:22:58

by Christoph Lameter

[permalink] [raw]
Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Jens Axboe wrote:

> > Ok. So another solution may be to limit the blocksizes that can be used
> > with a device?
>
> That'd work for creation, but not for moving things around.

What do you mean by moving things around? Creation binds a filesystem to a
device.

> > Hmmmm.. We could keep the existing scheme too and check that device
> > drivers split things up if they are too large? Isn't it possible today
> > to create a huge bio of 2M for huge pages and send it to a device?
>
> Not sure, aren't the constituents of compound pages the basis for IO?

get_user_pages() serializes compound pages into the base pages. But doesn't
the I/O layer coalesce these later into 2M chunks again?

2007-08-31 15:28:50

by Christoph Lameter

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 Aug 2007, Dmitry Monakhov wrote:

> > Ok. So another solution may be to limit the blocksizes that can be used
> > with a device?
> IMHO it is not good, because after a fs was created with a big blksize its
> image can't be used on other devices.

Ok so a raw copy of the partition would do this?

> We may just rewrite submit_bh() similar to drivers/md/dm-io.c:do_region
> with the following pseudocode:
>
> remaining = super_page_size();

That would be compound_size(page)

> while (remaining) {
> init_bio(bio);
> /*Try and add as many pages as possible*/

This seems to be doing the same as get_user_pages() serializing the
compound page.

> 	while (remaining) {
> 		dp->get_page(dp, &page, &len, &offset);
> 		len = min(len, to_bytes(remaining));
> 		if (!bio_add_page(bio, page, len, offset))
> 			break;
> 		offset = 0;
> 		remaining -= to_sector(len);
> 		dp->next_page(dp);
> 	}
> 	atomic_inc(&io->count);
> 	submit_bio(rw, bio);
> }

Another solution may be to not serialize, but instead determine the maximum
segment length and generate bios that reference various subsections of the
compound page, each of that length. That way you do not serialize and later
coalesce again.
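
A sketch of what that could look like (hypothetical helper, not part of
the patchset; max_bytes would be derived from the queue limits, and
completion handling via bi_end_io is omitted):

static void submit_compound_page(int rw, struct block_device *bdev,
				 sector_t sector, struct page *page,
				 unsigned int size, unsigned int max_bytes)
{
	unsigned int offset = 0;

	/* one bio per max_bytes-sized subrange of the compound page */
	while (offset < size) {
		unsigned int len = min(size - offset, max_bytes);
		struct bio *bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = bdev;
		bio->bi_sector = sector + (offset >> 9);
		bio_add_page(bio, page, len, offset);
		submit_bio(rw, bio);
		offset += len;
	}
}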

2007-08-31 16:39:44

by Jörn Engel

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, 31 August 2007 08:22:45 -0700, Christoph Lameter wrote:
>
> What do you mean by moving things around? Creation binds a filesystem to a
> device.

Create the filesystem on a USB key, then move it to the next machine,
I suppose.

Or on any other movable medium, including disks, nbd, iSCSI,...

Jörn

--
Doubt is not a pleasant condition, but certainty is an absurd one.
-- Voltaire

2007-08-31 19:00:32

by Jens Axboe

Subject: Re: [11/36] Use page_cache_xxx in fs/buffer.c

On Fri, Aug 31 2007, Christoph Lameter wrote:
> On Fri, 31 Aug 2007, Jens Axboe wrote:
>
> > > Ok. So another solution may be to limit the blocksizes that can be used
> > > with a device?
> >
> > That'd work for creation, but not for moving things around.
>
> What do you mean by moving things around? Creation binds a filesystem to a
> device.

Only the bottom part. Change controller, move disk, whatever. There are
lots of ways to change part of the IO path.

> > > Hmmmm.. We could keep the existing scheme too and check that device
> > > drivers split things up if they are too large? Isn't it possible today
> > > to create a huge bio of 2M for huge pages and send it to a device?
> >
> > Not sure, aren't the constituents of compound pages the basis for IO?
>
> get_user_pages() serializes compound pages into the base pages. But doesn't
> the I/O layer coalesce these later into 2M chunks again?

You pretty much hit the nail on the head there yourself. The io layer
_may_ coalesce them all together, but it may also stop at an arbitrary
point and put the remainder in another request.

This situation is different from submitting one huge piece: the above is
what has always happened regardless of whether the origin happens to be
a compound page or not.

--
Jens Axboe