LinuxLists.cc - [PATCH] Handle i_size > s

2007-12-20 17:51:18

Subject: [PATCH] Handle i_size > s_maxbytes correctly

Hi Andrew,

nobody seemed to care about this patch, so could you pull it into -mm so
that it gets broader testing? Thanks.

Honza

--
Jan Kara <[email protected]>
SUSE Labs, CR
---

Although we don't allow writes over s_maxbytes, it can happen that a file's
size is larger than s_maxbytes. For example we can write the file from a
computer with a different architecture (which has larger s_maxbytes), boot
a kernel with a different set of config options (CONFIG_LBD...), or if two
nodes in a [Ocfs2, and likely Gfs2] cluster have mounted the same file
system and have different s_maxbytes. Thus we have to make sure we don't
crash / corrupt data when seeing such file (page offset of the last page
needn't fit into pgoff_t). Firstly, we make read() and mmap() return error
when user tries to access the file above s_maxbytes, secondly we introduce
a function i_size_read_trunc() which returns min(i_size, s_maxbytes) and
use it when determining maximal page offset we are interested in.

Signed-off-by: Jan Kara <[email protected]>
CC: Mark Fasheh <[email protected]>

diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e01..3861118 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1623,7 +1623,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,

BUG_ON(!PageLocked(page));

- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+ last_block = (i_size_read_trunc(inode) - 1) >> inode->i_blkbits;

if (!page_has_buffers(page)) {
create_empty_buffers(page, blocksize,
@@ -2084,7 +2084,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
head = page_buffers(page);

iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ lblock = (i_size_read_trunc(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;
@@ -2347,7 +2347,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
int ret = -EINVAL;

lock_page(page);
- size = i_size_read(inode);
+ size = i_size_read_trunc(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
/* page got truncated out from underneath us */
@@ -2603,7 +2603,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
int ret;
@@ -2803,7 +2803,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;

diff --git a/fs/direct-io.c b/fs/direct-io.c
index acf0da1..8223868 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -525,8 +525,8 @@ static int get_more_blocks(struct dio *dio)

create = dio->rw & WRITE;
if (dio->lock_type == DIO_LOCKING) {
- if (dio->block_in_file < (i_size_read(dio->inode) >>
- dio->blkbits))
+ if (dio->block_in_file < (i_size_read_trunc(dio->inode)
+ >> dio->blkbits))
create = 0;
} else if (dio->lock_type == DIO_NO_LOCKING) {
create = 0;
@@ -870,7 +870,8 @@ do_holes:
* Be sure to account for a partial block as the
* last block in the file
*/
- i_size_aligned = ALIGN(i_size_read(dio->inode),
+ i_size_aligned =
+ ALIGN(i_size_read_trunc(dio->inode),
1 << blkbits);
if (dio->block_in_file >=
i_size_aligned >> blkbits) {
@@ -961,7 +962,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->next_block_for_io = -1;

dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
+ dio->i_size = i_size_read_trunc(inode);

spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
diff --git a/fs/mpage.c b/fs/mpage.c
index d54f8f8..c666089 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -190,7 +190,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,

block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (i_size_read_trunc(inode) + blocksize - 1) >>
+ blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -468,7 +469,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct block_device *boundary_bdev = NULL;
int length;
struct buffer_head map_bh;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
int ret = 0;

if (page_has_buffers(page)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94c..ed91acc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/mount.h>
#include "read_write.h"

#include <asm/uaccess.h>
@@ -263,6 +264,11 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
+ if (unlikely(*pos + count > file->f_vfsmnt->mnt_sb->s_maxbytes)) {
+ if (*pos >= file->f_vfsmnt->mnt_sb->s_maxbytes)
+ return -EOVERFLOW;
+ count = file->f_vfsmnt->mnt_sb->s_maxbytes - *pos;
+ }

ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..d24ef6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1394,6 +1394,16 @@ static inline void inode_dec_link_count(struct inode *inode)
mark_inode_dirty(inode);
}

+/* Truncate i_size at s_maxbytes so that pagecache doesn't have problems */
+static inline loff_t i_size_read_trunc(const struct inode *inode)
+{
+ loff_t i_size = i_size_read(inode);
+
+ if (unlikely(inode->i_sb->s_maxbytes < i_size))
+ return inode->i_sb->s_maxbytes;
+ return i_size;
+}
+
extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
static inline void file_accessed(struct file *file)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5f..eb97b9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(sync_page_range_nolock);
*/
int filemap_fdatawait(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
+ loff_t i_size = i_size_read_trunc(mapping->host);

if (i_size == 0)
return 0;
@@ -912,7 +912,7 @@ page_ok:
* another truncate extends the file - this is desired though).
*/

- isize = i_size_read(inode);
+ isize = i_size_read_trunc(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
@@ -1298,7 +1298,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int did_readaround = 0;
int ret = 0;

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

@@ -1373,7 +1374,7 @@ retry_find:
goto page_not_uptodate;

/* Must recheck i_size under page lock */
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(vmf->pgoff >= size)) {
unlock_page(page);
page_cache_release(page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 32132f3..df26ef5 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -63,7 +63,7 @@ do_xip_mapping_read(struct address_space *mapping,
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;

- isize = i_size_read(inode);
+ isize = i_size_read_trunc(inode);
if (!isize)
goto out;

@@ -219,7 +219,7 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)

/* XXX: are VM_FAULT_ codes OK? */

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

diff --git a/mm/mmap.c b/mm/mmap.c
index facc1a7..b2ee331 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -980,6 +980,13 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
if (locks_verify_locked(inode))
return -EAGAIN;

+ /*
+ * Make sure we don't map more than fs is able to handle
+ */
+ if ((((loff_t)pgoff) << PAGE_SHIFT) + len >
+ inode->i_sb->s_maxbytes)
+ return -EINVAL;
+
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
diff --git a/mm/readahead.c b/mm/readahead.c
index c9c50ca..8ec8d4a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -131,7 +131,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
LIST_HEAD(page_pool);
int page_idx;
int ret = 0;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read_trunc(inode);

if (isize == 0)
goto out;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648..88d3bb1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1082,7 +1082,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
*/
probe_block = 0;
page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
+ last_block = i_size_read_trunc(inode) >> blkbits;
while ((probe_block + blocks_per_page) <= last_block &&
page_no < sis->max) {
unsigned block_in_page;
@@ -1517,7 +1517,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}

- swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+ swapfilesize = i_size_read_trunc(inode) >> PAGE_SHIFT;

/*
* Read the swap header.

2007-12-20 18:00:35

by Mark Fasheh

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

On Thu, Dec 20, 2007 at 06:51:04PM +0100, Jan Kara wrote:
> Hi Andrew,
>
> nobody seemed to care about this patch, so could you pull it into -mm so
> that it gets broader testing? Thanks.

Oh, this can get my ack btw:

Acked-by: Mark Fasheh <[email protected]>
--Mark

--
Mark Fasheh
Principal Software Developer, Oracle
[email protected]

2007-12-22 08:12:35

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

On Thu, 20 Dec 2007 18:51:04 +0100 Jan Kara <[email protected]> wrote:

> Although we don't allow writes over s_maxbytes, it can happen that a file's
> size is larger than s_maxbytes. For example we can write the file from a
> computer with a different architecture (which has larger s_maxbytes), boot
> a kernel with a different set of config options (CONFIG_LBD...), or if two
> nodes in a [Ocfs2, and likely Gfs2] cluster have mounted the same file
> system and have different s_maxbytes. Thus we have to make sure we don't
> crash / corrupt data when seeing such file (page offset of the last page
> needn't fit into pgoff_t). Firstly, we make read() and mmap() return error
> when user tries to access the file above s_maxbytes, secondly we introduce
> a function i_size_read_trunc() which returns min(i_size, s_maxbytes) and
> use it when determining maximal page offset we are interested in.
>
> ...
>
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1623,7 +1623,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
>
> BUG_ON(!PageLocked(page));
>
> - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
> + last_block = (i_size_read_trunc(inode) - 1) >> inode->i_blkbits;
>
> ...
>
> +/* Truncate i_size at s_maxbytes so that pagecache doesn't have problems */
> +static inline loff_t i_size_read_trunc(const struct inode *inode)
> +{
> + loff_t i_size = i_size_read(inode);
> +
> + if (unlikely(inode->i_sb->s_maxbytes < i_size))
> + return inode->i_sb->s_maxbytes;
> + return i_size;
> +}
> +

This patch takes the total text size of the affected nine files from 74167
bytes up to 75066 on i386. This is core, core kernel. Ouch.

It's also pretty fragile. We now have i_size_read()s and
i_size_read_trunc()s sprinkled all over the place with no obvious rules to
determine when we should use one versus the other.

uninlining i_size_read_trunc() is obviously the first thing to look at but
the cost is still appreciable and boy the problem which is being fixed here
is rare and obscure.

Can we look at alternatives please? What about just failing the open
attempt?

2007-12-22 20:04:58

by Mark Fasheh

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

On Sat, Dec 22, 2007 at 12:12:06AM -0800, Andrew Morton wrote:
> This patch takes the total text size of the affected nine files from 74167
> bytes up to 75066 on i386. This is core, core kernel. Ouch.

Yeah, as you note below - this should be un-inlined.

> It's also pretty fragile. We now have i_size_read()s and
> i_size_read_trunc()s sprinkled all over the place with no obvious rules to
> determine when we should use one versus the other.

Hmm, I hadn't thought about that one, but what you say makes sense. If I was
a 3rd party looking at this, I'd be pretty confused as to which i_size
function to use, etc.

> uninlining i_size_read_trunc() is obviously the first thing to look at but
> the cost is still appreciable and boy the problem which is being fixed here
> is rare and obscure.
>
> Can we look at alternatives please? What about just failing the open
> attempt?

The problem is that Ocfs2 can have this happen while a file is open (if a 64
bit node extends it past what the 32 bit node can see while they both have
open file descriptors). Even in Ocfs2 by the way, this is a bit rare - most
of the time folks run nodes of the same architecture. We still allow mixed
arch clusters though.

We could stop the VFS changes at some open checks and leave the rest
internally in Ocfs2 [and GFS2 I'm betting]. Generally though, it seems like
the type of thing that the VFS could help more with.

Jan, what if we just define a helper function which checks that a particular
file access is within the bounds allowed by the file system:

int check_file_access_offsets(struct inode *inode, loff_t pos, size_t count);

Something like that would be potentially better documenting than another
i_size accessor. Also, we could try to just leave it in the higher level
functions (write, read, splice, fault, mkwrite). I'm still not convinced
that we'd need to do anything in readpage/writepage if we just catch the
problem higher up.

Thanks,
--Mark

--
Mark Fasheh
Principal Software Developer, Oracle
[email protected]

2008-01-07 16:40:00

by Jan Kara

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

On Sat 22-12-07 00:12:06, Andrew Morton wrote:
Sorry for a late reply but I was on vacation.

> On Thu, 20 Dec 2007 18:51:04 +0100 Jan Kara <[email protected]> wrote:
>
> > Although we don't allow writes over s_maxbytes, it can happen that a file's
> > size is larger than s_maxbytes. For example we can write the file from a
> > computer with a different architecture (which has larger s_maxbytes), boot
> > a kernel with a different set of config options (CONFIG_LBD...), or if two
> > nodes in a [Ocfs2, and likely Gfs2] cluster have mounted the same file
> > system and have different s_maxbytes. Thus we have to make sure we don't
> > crash / corrupt data when seeing such file (page offset of the last page
> > needn't fit into pgoff_t). Firstly, we make read() and mmap() return error
> > when user tries to access the file above s_maxbytes, secondly we introduce
> > a function i_size_read_trunc() which returns min(i_size, s_maxbytes) and
> > use it when determining maximal page offset we are interested in.
> >
> > ...
> >
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -1623,7 +1623,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
> >
> > BUG_ON(!PageLocked(page));
> >
> > - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
> > + last_block = (i_size_read_trunc(inode) - 1) >> inode->i_blkbits;
> >
> > ...
> >
> > +/* Truncate i_size at s_maxbytes so that pagecache doesn't have problems */
> > +static inline loff_t i_size_read_trunc(const struct inode *inode)
> > +{
> > + loff_t i_size = i_size_read(inode);
> > +
> > + if (unlikely(inode->i_sb->s_maxbytes < i_size))
> > + return inode->i_sb->s_maxbytes;
> > + return i_size;
> > +}
> > +
>
> This patch takes the total text size of the affected nine files from 74167
> bytes up to 75066 on i386. This is core, core kernel. Ouch.
>
> It's also pretty fragile. We now have i_size_read()s and
> i_size_read_trunc()s sprinkled all over the place with no obvious rules to
> determine when we should use one versus the other.
Looking at the patch from the distance of two weeks I agree this is a
flaw...

> uninlining i_size_read_trunc() is obviously the first thing to look at but
> the cost is still appreciable and boy the problem which is being fixed here
> is rare and obscure.
>
> Can we look at alternatives please? What about just failing the open
> attempt?
As Mark wrote, just failing the open does not solve the problem for
clustered filesystems. I'll try to come up with something that would be
a less fragile solution (and won't increase the text size that much).

Thanks for your comments

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-01-07 16:52:55

by Jan Kara

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

On Sat 22-12-07 12:03:10, Mark Fasheh wrote:
> On Sat, Dec 22, 2007 at 12:12:06AM -0800, Andrew Morton wrote:
> > This patch takes the total text size of the affected nine files from 74167
> > bytes up to 75066 on i386. This is core, core kernel. Ouch.
>
> Yeah, as you note below - this should be un-inlined.
>
>
> > It's also pretty fragile. We now have i_size_read()s and
> > i_size_read_trunc()s sprinkled all over the place with no obvious rules to
> > determine when we should use one versus the other.
>
> Hmm, I hadn't thought about that one, but what you say makes sense. If I was
> a 3rd party looking at this, I'd be pretty confused as to which i_size
> function to use, etc.
>
>
> > uninlining i_size_read_trunc() is obviously the first thing to look at but
> > the cost is still appreciable and boy the problem which is being fixed here
> > is rare and obscure.
> >
> > Can we look at alternatives please? What about just failing the open
> > attempt?
>
> The problem is that Ocfs2 can have this happen while a file is open (if a 64
> bit node extends it past what the 32 bit node can see while they both have
> open file descriptors). Even in Ocfs2 by the way, this is a bit rare - most
> of the time folks run nodes of the same architecture. We still allow mixed
> arch clusters though.
>
> We could stop the VFS changes at some open checks and leave the rest
> internally in Ocfs2 [and GFS2 I'm betting]. Generally though, it seems like
> the type of thing that the VFS could help more with.
>
>
> Jan, what if we just define a helper function which checks that a particular
> file access is within the bounds allowed by the file system:
>
> int check_file_access_offsets(struct inode *inode, loff_t pos, size_t count);
>
> Something like that would be potentially better documenting than another
> i_size accessor. Also, we could try to just leave it in the higher level
> functions (write, read, splice, fault, mkwrite). I'm still not convinced
> that we'd need to do anything in readpage/writepage if we just catch the
> problem higher up.
Actually there are two problems. The first one is calls from userspace
beyond s_maxbytes. Those can be (and are) easily handled. But then there
are places inside VFS which compute things like maximal page index in the
file (and they compute it regardless the actual place where we access the
file). These computations can overflow even if the access is perfectly
within s_maxbytes and cause nasty things. That's why I've introduced that
i_size_read_trunc() function and we really need to fix them somehow for
OCFS2 or we have to make sure that these functions are never called on
files larger than s_maxbytes. I'll try to come up with more obvious
alternative to i_size_read_trunc() and if I fail I'll investigate how hard
it would be to handle the problems inside OCFS2.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2008-01-09 19:46:15

by Jan Kara

[permalink] [raw]

Subject: Re: [PATCH] Handle i_size > s_maxbytes correctly

Hi Andrew,

On Sat 22-12-07 00:12:06, Andrew Morton wrote:
> On Thu, 20 Dec 2007 18:51:04 +0100 Jan Kara <[email protected]> wrote:
>
> > Although we don't allow writes over s_maxbytes, it can happen that a file's
> > size is larger than s_maxbytes. For example we can write the file from a
> > computer with a different architecture (which has larger s_maxbytes), boot
> > a kernel with a different set of config options (CONFIG_LBD...), or if two
> > nodes in a [Ocfs2, and likely Gfs2] cluster have mounted the same file
> > system and have different s_maxbytes. Thus we have to make sure we don't
> > crash / corrupt data when seeing such file (page offset of the last page
> > needn't fit into pgoff_t). Firstly, we make read() and mmap() return error
> > when user tries to access the file above s_maxbytes, secondly we introduce
> > a function i_size_read_trunc() which returns min(i_size, s_maxbytes) and
> > use it when determining maximal page offset we are interested in.
> >
> > ...
> >
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -1623,7 +1623,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
> >
> > BUG_ON(!PageLocked(page));
> >
> > - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
> > + last_block = (i_size_read_trunc(inode) - 1) >> inode->i_blkbits;
> >
> > ...
> >
> > +/* Truncate i_size at s_maxbytes so that pagecache doesn't have problems */
> > +static inline loff_t i_size_read_trunc(const struct inode *inode)
> > +{
> > + loff_t i_size = i_size_read(inode);
> > +
> > + if (unlikely(inode->i_sb->s_maxbytes < i_size))
> > + return inode->i_sb->s_maxbytes;
> > + return i_size;
> > +}
> > +
>
> This patch takes the total text size of the affected nine files from 74167
> bytes up to 75066 on i386. This is core, core kernel. Ouch.
>
> It's also pretty fragile. We now have i_size_read()s and
> i_size_read_trunc()s sprinkled all over the place with no obvious rules to
> determine when we should use one versus the other.
>
> uninlining i_size_read_trunc() is obviously the first thing to look at but
> the cost is still appreciable and boy the problem which is being fixed here
> is rare and obscure.
>
> Can we look at alternatives please? What about just failing the open
> attempt?
So I've given some more time to this problem. The patch in the end of
this email is the result. Instead of introducing i_size_read_trunc()
function, it introduces two functions for conversion of an offset into
index / block number and these functions handle overflows. The impact on
the text size is a bit smaller now (600 bytes on i386 without LFS, 74 bytes
on x86_64), impact on run time should be smaller (we don't have to grab
s_maxbytes) and use of these functions should be more obvious as well
(whenever you convert an offset to a page index / block and the conversion
can overflow, you can use these auxiliary functions). I agree this is still
not ideal (given the rarity of the problem) so below I discuss another
possibility.
Don't allow files with i_size > s_maxbytes in VFS at all. For local
filesystems we can just check this on open and everything is fine but with
remote filesystems such as OCFS2 (or even NFS) filesize can be changed on
the fly from a different machine. So to avoid problems we can either
introduce some locking to prevent changes of i_size from other machines
while we are in critical sections (awww, I really don't think this is
better) or truncate i_size to s_maxbytes when we update i_size from what
we've received via network / shared storage (but then we'd have to track
whether user truncated file to some size or whether fs truncated it just
for safety and apps could be confused too).
Any suggestions for further improvement or other solutions welcome :).

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
---

Although we don't allow writes over s_maxbytes, it can happen that a file's
size is larger than s_maxbytes. For example we can write the file from a
computer with a different architecture (which has larger s_maxbytes), boot a
kernel with a different set of config options (CONFIG_LBD...), or if two nodes
in a [Ocfs2, and likely Gfs2] cluster have mounted the same file system and
have different s_maxbytes. Thus we have to make sure we don't crash / corrupt
data when seeing such file (page offset of the last page needn't fit into
pgoff_t, block offset into sector_t). Firstly, we make read() and mmap() return
error when user tries to access the file above s_maxbytes, secondly we
introduce functions offset_to_index_max() and offset_to_block_max() which
return maximum number fitting into pgoff_t or sector_t, respectively, in
case the number computed from i_size would overflow.

Signed-off-by: Jan Kara <[email protected]>
CC: Mark Fasheh <[email protected]>

diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e01..4323a6d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1623,7 +1623,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,

BUG_ON(!PageLocked(page));

- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+ last_block = offset_to_block_max(i_size_read(inode) - 1,
+ inode->i_blkbits);

if (!page_has_buffers(page)) {
create_empty_buffers(page, blocksize,
@@ -2084,7 +2085,8 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
head = page_buffers(page);

iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ lblock = offset_to_block_max(i_size_read(inode)+blocksize-1,
+ inode->i_blkbits);
bh = head;
nr = 0;
i = 0;
@@ -2604,7 +2606,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = offset_to_index_max(i_size);
unsigned offset;
int ret;

@@ -2804,7 +2806,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = offset_to_index_max(i_size);
unsigned offset;

/* Is the page fully inside i_size? */
diff --git a/fs/mpage.c b/fs/mpage.c
index d54f8f8..4d1d12c 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -190,7 +190,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,

block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = offset_to_block_max(i_size_read(inode) +
+ blocksize - 1, blkbits);
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -526,7 +527,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
*/
BUG_ON(!PageUptodate(page));
block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
- last_block = (i_size - 1) >> blkbits;
+ last_block = offset_to_block_max(i_size - 1, blkbits);
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {

@@ -557,7 +558,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
first_unmapped = page_block;

page_is_mapped:
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = offset_to_index_max(i_size);
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94c..ed91acc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/mount.h>
#include "read_write.h"

#include <asm/uaccess.h>
@@ -263,6 +264,11 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
+ if (unlikely(*pos + count > file->f_vfsmnt->mnt_sb->s_maxbytes)) {
+ if (*pos >= file->f_vfsmnt->mnt_sb->s_maxbytes)
+ return -EOVERFLOW;
+ count = file->f_vfsmnt->mnt_sb->s_maxbytes - *pos;
+ }

ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..826e718 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1394,6 +1394,17 @@ static inline void inode_dec_link_count(struct inode *inode)
mark_inode_dirty(inode);
}

+/* Convert byte offset to block number, if number would overflow
+ * sector_t, return maximum value fitting there. */
+static inline sector_t offset_to_block_max(loff_t off, int shift)
+{
+ loff_t block = off >> shift;
+
+ if (unlikely(block != (sector_t)block))
+ return ~(sector_t)0;
+ return block;
+}
+
extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
static inline void file_accessed(struct file *file)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index db8a410..adfd195 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -148,6 +148,19 @@ static inline loff_t page_offset(struct page *page)
return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
}

+/*
+ * Return index of a page on given offset, if index would overflow pgoff_t,
+ * return the maximum number fitting there.
+ */
+static inline pgoff_t offset_to_index_max(loff_t off)
+{
+ loff_t index = off >> PAGE_CACHE_SHIFT;
+
+ if (unlikely(index != (pgoff_t)index))
+ return ~((pgoff_t)0);
+ return index;
+}
+
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 01e260f..e37be0c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -380,7 +380,7 @@ int filemap_fdatawait(struct address_space *mapping)
return 0;

return wait_on_page_writeback_range(mapping, 0,
- (i_size - 1) >> PAGE_CACHE_SHIFT);
+ offset_to_index_max(i_size - 1));
}
EXPORT_SYMBOL(filemap_fdatawait);

@@ -925,7 +925,7 @@ page_ok:
*/

isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = offset_to_index_max(isize - 1);
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
@@ -1310,7 +1310,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int did_readaround = 0;
int ret = 0;

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = offset_to_index_max(i_size_read(inode) + PAGE_CACHE_SIZE - 1);
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

@@ -1385,7 +1385,7 @@ retry_find:
goto page_not_uptodate;

/* Must recheck i_size under page lock */
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = offset_to_index_max(i_size_read(inode) + PAGE_CACHE_SIZE - 1);
if (unlikely(vmf->pgoff >= size)) {
unlock_page(page);
page_cache_release(page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 5fa3fbf..e264b37 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -68,7 +68,7 @@ do_xip_mapping_read(struct address_space *mapping,
if (!isize)
goto out;

- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = offset_to_index_max(isize - 1);
for (;;) {
struct page *page;
unsigned long nr, ret;
@@ -220,7 +220,7 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)

/* XXX: are VM_FAULT_ codes OK? */

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = offset_to_index_max(i_size_read(inode) + PAGE_CACHE_SIZE - 1);
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

diff --git a/mm/mmap.c b/mm/mmap.c
index 15678aa..e28ca6e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -983,6 +983,13 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
if (locks_verify_locked(inode))
return -EAGAIN;

+ /*
+ * Make sure we don't map more than fs is able to handle
+ */
+ if ((((loff_t)pgoff) << PAGE_SHIFT) + len >
+ inode->i_sb->s_maxbytes)
+ return -EINVAL;
+
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
diff --git a/mm/readahead.c b/mm/readahead.c
index c9c50ca..67ccd91 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -136,7 +136,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (isize == 0)
goto out;

- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = offset_to_index_max(isize - 1);

/*
* Preallocate as many pages as we will need.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648..0951c7a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1082,7 +1082,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
*/
probe_block = 0;
page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
+ last_block = offset_to_block_max(i_size_read(inode), blkbits);
while ((probe_block + blocks_per_page) <= last_block &&
page_no < sis->max) {
unsigned block_in_page;
@@ -1517,6 +1517,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}

+ /* This can possibly overflow but we discover that later */
swapfilesize = i_size_read(inode) >> PAGE_SHIFT;

/*