2006-11-30 07:21:02

by Nick Piggin

[permalink] [raw]
Subject: [patch 1/3] mm: pagecache write deadlocks zerolength fix


writev with a zero-length segment is a noop, and we shouldn't return EFAULT.

Signed-off-by: Nick Piggin <[email protected]>

Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -198,6 +198,9 @@ static inline int fault_in_pages_writeab
{
int ret;

+ if (unlikely(size == 0))
+ return 0;
+
/*
* Writing zeroes into userspace here is OK, because we know that if
* the zero gets there, we'll be overwriting it.
@@ -222,6 +225,9 @@ static inline int fault_in_pages_readabl
volatile char c;
int ret;

+ if (unlikely(size == 0))
+ return 0;
+
ret = __get_user(c, uaddr);
if (ret == 0) {
const char __user *end = uaddr + size - 1;


2006-11-30 07:22:09

by Nick Piggin

[permalink] [raw]
Subject: [patch 2/3] mm: pagecache write deadlocks stale holes fix


If the data copy within a prepare_write can potentially allocate blocks
to fill holes, so if the page copy fails then new blocks must be zeroed
so uninitialised data cannot be exposed with a subsequent read.

Signed-off-by: Nick Piggin <[email protected]>

Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1951,7 +1951,14 @@ retry_noprogress:
bytes);
dec_preempt_count();

- if (!PageUptodate(page)) {
+ if (unlikely(copied != bytes)) {
+ /*
+ * Must zero out new buffers here so that we do end
+ * up properly filling holes rather than leaving stale
+ * data in them that might be read in future.
+ */
+ page_zero_new_buffers(page);
+
/*
* If the page is not uptodate, we cannot allow a
* partial commit_write because when we unlock the
@@ -1965,10 +1972,10 @@ retry_noprogress:
* Abort the operation entirely with a zero length
* commit_write. Retry. We will enter the
* single-segment path below, which should get the
- * filesystem to bring the page uputodate for us next
+ * filesystem to bring the page uptodate for us next
* time.
*/
- if (unlikely(copied != bytes))
+ if (!PageUptodate(page))
copied = 0;
}

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -1491,6 +1491,39 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);

+void page_zero_new_buffers(struct page *page)
+{
+ unsigned int block_start, block_end;
+ struct buffer_head *head, *bh;
+
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ return;
+
+ bh = head = page_buffers(page);
+ block_start = 0;
+ do {
+ block_end = block_start + bh->b_size;
+
+ if (buffer_new(bh)) {
+ void *kaddr;
+
+ if (!PageUptodate(page)) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr+block_start, 0, bh->b_size);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ clear_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -1784,36 +1817,33 @@ static int __block_prepare_write(struct
}
continue;
}
- if (buffer_new(bh))
- clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
break;
- if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
- if (PageUptodate(page)) {
- set_buffer_uptodate(bh);
- continue;
- }
- if (block_end > to || block_start < from) {
- void *kaddr;
-
- kaddr = kmap_atomic(page, KM_USER0);
- if (block_end > to)
- memset(kaddr+to, 0,
- block_end-to);
- if (block_start < from)
- memset(kaddr+block_start,
- 0, from-block_start);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- }
+ }
+ if (buffer_new(bh)) {
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ if (PageUptodate(page)) {
+ set_buffer_uptodate(bh);
continue;
}
+ if (block_end > to || block_start < from) {
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (block_end > to)
+ memset(kaddr+to, 0, block_end-to);
+ if (block_start < from)
+ memset(kaddr+block_start,
+ 0, from-block_start);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ continue;
}
+
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
@@ -1833,43 +1863,10 @@ static int __block_prepare_write(struct
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
- if (!err) {
- bh = head;
- do {
- if (buffer_new(bh))
- clear_buffer_new(bh);
- } while ((bh = bh->b_this_page) != head);
- return 0;
- }
- /* Error case: */
- /*
- * Zero out any newly allocated blocks to avoid exposing stale
- * data. If BH_New is set, we know that the block was newly
- * allocated in the above loop.
- */
- bh = head;
- block_start = 0;
- do {
- block_end = block_start+blocksize;
- if (block_end <= from)
- goto next_bh;
- if (block_start >= to)
- break;
- if (buffer_new(bh)) {
- void *kaddr;

- clear_buffer_new(bh);
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+block_start, 0, bh->b_size);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- }
-next_bh:
- block_start = block_end;
- bh = bh->b_this_page;
- } while (bh != head);
+ if (err)
+ page_zero_new_buffers(page);
+
return err;
}

@@ -2246,7 +2243,6 @@ int nobh_prepare_write(struct page *page
int i;
int ret = 0;
int is_mapped_to_disk = 1;
- int dirtied_it = 0;

if (PageMappedToDisk(page))
return 0;
@@ -2282,15 +2278,16 @@ int nobh_prepare_write(struct page *page
if (PageUptodate(page))
continue;
if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+ /*
+ * XXX: stale data can be exposed as we are not zeroing
+ * out newly allocated blocks. If a subsequent operation
+ * fails, we'll never know about this :(
+ */
kaddr = kmap_atomic(page, KM_USER0);
- if (block_start < from) {
- memset(kaddr+block_start, 0, from-block_start);
- dirtied_it = 1;
- }
- if (block_end > to) {
+ if (block_start < from)
+ memset(kaddr+block_start, 0, block_end-block_start);
+ if (block_end > to)
memset(kaddr + to, 0, block_end - to);
- dirtied_it = 1;
- }
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
continue;
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -151,6 +151,7 @@ struct buffer_head *alloc_page_buffers(s
int retry);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
+void page_zero_new_buffers(struct page *page);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -206,7 +206,7 @@ static inline int fault_in_pages_writeab
* the zero gets there, we'll be overwriting it.
*/
ret = __put_user(0, uaddr);
- if (ret == 0) {
+ if (likely(ret == 0)) {
char __user *end = uaddr + size - 1;

/*
@@ -229,7 +229,7 @@ static inline int fault_in_pages_readabl
return 0;

ret = __get_user(c, uaddr);
- if (ret == 0) {
+ if (likely(ret == 0)) {
const char __user *end = uaddr + size - 1;

if (((unsigned long)uaddr & PAGE_MASK) !=

2006-11-30 07:22:51

by Nick Piggin

[permalink] [raw]
Subject: [patch 3/3] fs: fix cont vs deadlock patches


Rework the cont filesystem helpers so that generic_cont_expand does the
actual work of expanding the file. cont_prepare_write then calls this
routine if expanding is needed, and retries. Also solves the problem
where cont_prepare_write would previously hold the target page locked
while doing not-very-nice things like locking other pages.

Means that zero-length prepare/commit_write pairs are no longer needed
as an overloaded directive to extend the file, thus cont should operate
better within the new deadlock-free buffered write code.

Converts fat over to the new cont scheme.

Signed-off-by: Nick Piggin <[email protected]>


Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa
return 0;
}

-/* utility function for filesystems that need to do work on expanding
- * truncates. Uses prepare/commit_write to allow the filesystem to
- * deal with the hole.
+/*
+ * Utility function for filesystems that need to do work on expanding
+ * truncates. For moronic filesystems that do not allow holes in file.
*/
-static int __generic_cont_expand(struct inode *inode, loff_t size,
- pgoff_t index, unsigned int offset)
+int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes,
+ get_block_t *get_block)
{
struct address_space *mapping = inode->i_mapping;
+ unsigned long blocksize = 1 << inode->i_blkbits;
struct page *page;
unsigned long limit;
- int err;
+ int status;

- err = -EFBIG;
+ status = -EFBIG;
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && size > (loff_t)limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct
if (size > inode->i_sb->s_maxbytes)
goto out;

- err = -ENOMEM;
- page = grab_cache_page(mapping, index);
- if (!page)
- goto out;
- err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
- if (err) {
- /*
- * ->prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
- */
- unlock_page(page);
- page_cache_release(page);
- vmtruncate(inode, inode->i_size);
- goto out;
- }
+ status = 0;

- err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+ while (*bytes < size) {
+ unsigned int zerofrom;
+ unsigned int zeroto;
+ void *kaddr;
+ pgoff_t pgpos;
+
+ pgpos = *bytes >> PAGE_CACHE_SHIFT;
+ page = grab_cache_page(mapping, pgpos);
+ if (!page) {
+ status = -ENOMEM;
+ break;
+ }
+ /* we might sleep */
+ if (*bytes >> PAGE_CACHE_SHIFT != pgpos)
+ goto unlock;

- unlock_page(page);
- page_cache_release(page);
- if (err > 0)
- err = 0;
-out:
- return err;
-}
+ zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ if (zerofrom & (blocksize-1))
+ *bytes = (*bytes + blocksize-1) & (blocksize-1);

-int generic_cont_expand(struct inode *inode, loff_t size)
-{
- pgoff_t index;
- unsigned int offset;
+ zeroto = PAGE_CACHE_SIZE;

- offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
+ status = __block_prepare_write(inode, page, zerofrom,
+ zeroto, get_block);
+ if (status)
+ goto unlock;
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ status = __block_commit_write(inode, page, zerofrom, zeroto);

- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- /* caller must handle this extra byte. */
- offset++;
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ if (status) {
+ BUG_ON(status == AOP_TRUNCATED_PAGE);
+ break;
+ }
}
- index = size >> PAGE_CACHE_SHIFT;

- return __generic_cont_expand(inode, size, index, offset);
-}
-
-int generic_cont_expand_simple(struct inode *inode, loff_t size)
-{
- loff_t pos = size - 1;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ */
+ if (size > inode->i_size) {
+ i_size_write(inode, size);
+ mark_inode_dirty(inode);
+ }

- /* prepare/commit_write can handle even if from==to==start of block. */
- return __generic_cont_expand(inode, size, index, offset);
+out:
+ return status;
}

-/*
- * For moronic filesystems that do not allow holes in file.
- * We may have to extend the file.
- */
-
int cont_prepare_write(struct page *page, unsigned offset,
unsigned to, get_block_t *get_block, loff_t *bytes)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct page *new_page;
- pgoff_t pgpos;
- long status;
- unsigned zerofrom;
- unsigned blocksize = 1 << inode->i_blkbits;
- void *kaddr;
-
- while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
- status = -ENOMEM;
- new_page = grab_cache_page(mapping, pgpos);
- if (!new_page)
- goto out;
- /* we might sleep */
- if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
- unlock_page(new_page);
- page_cache_release(new_page);
- continue;
- }
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
- if (zerofrom & (blocksize-1)) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
- status = __block_prepare_write(inode, new_page, zerofrom,
- PAGE_CACHE_SIZE, get_block);
- if (status)
- goto out_unmap;
- kaddr = kmap_atomic(new_page, KM_USER0);
- memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
- flush_dcache_page(new_page);
- kunmap_atomic(kaddr, KM_USER0);
- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
- unlock_page(new_page);
- page_cache_release(new_page);
- }
-
- if (page->index < pgpos) {
- /* completely inside the area */
- zerofrom = offset;
- } else {
- /* page covers the boundary, find the boundary offset */
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
-
- /* if we will expand the thing last block will be filled */
- if (to > zerofrom && (zerofrom & (blocksize-1))) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
+ loff_t size = (page->index << PAGE_CACHE_SHIFT) + offset;
+ struct inode *inode = page->mapping->host;
+ int err;

- /* starting below the boundary? Nothing to zero out */
- if (offset <= zerofrom)
- zerofrom = offset;
- }
- status = __block_prepare_write(inode, page, zerofrom, to, get_block);
- if (status)
- goto out1;
- if (zerofrom < offset) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+zerofrom, 0, offset-zerofrom);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- __block_commit_write(inode, page, zerofrom, offset);
+ if (*bytes < size) {
+ unlock_page(page);
+ err = generic_cont_expand(inode, size, bytes, get_block);
+ if (!err)
+ err = AOP_TRUNCATED_PAGE;
+ else
+ lock_page(page); /* caller expects this */
+ return err;
}
- return 0;
-out1:
- ClearPageUptodate(page);
- return status;

-out_unmap:
- ClearPageUptodate(new_page);
- unlock_page(new_page);
- page_cache_release(new_page);
-out:
- return status;
+ err = __block_prepare_write(inode, page, offset, to, get_block);
+ if (err)
+ ClearPageUptodate(page);
+ return err;
}

int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -3015,7 +2953,6 @@ EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
-EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -202,8 +202,7 @@ int block_read_full_page(struct page*, g
int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
loff_t *);
-int generic_cont_expand(struct inode *inode, loff_t size);
-int generic_cont_expand_simple(struct inode *inode, loff_t size);
+int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, get_block_t *);
int block_commit_write(struct page *page, unsigned from, unsigned to);
void block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
Index: linux-2.6/fs/fat/inode.c
===================================================================
--- linux-2.6.orig/fs/fat/inode.c
+++ linux-2.6/fs/fat/inode.c
@@ -151,11 +151,12 @@ static int fat_commit_write(struct file
{
struct inode *inode = page->mapping->host;
int err;
- if (to - from > 0)
- return 0;

err = generic_commit_write(file, page, from, to);
- if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
+ if (err)
+ return err;
+
+ if (!(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
mark_inode_dirty(inode);

2006-11-30 07:26:11

by Nick Piggin

[permalink] [raw]
Subject: [patch 0/3] more buffered write fixes

Sorry, I should give some background.

The following patches attempt to fix the problems people have identified
with buffered write deadlock patches. Against 2.6.19 + the previous patchset
dropped from -mm.

Comments?

2006-11-30 10:15:42

by Andreas Schwab

[permalink] [raw]
Subject: Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix

Nick Piggin <[email protected]> writes:

> writev with a zero-length segment is a noop, and we shouldn't return EFAULT.

AFAICS the callers of these functions never pass a zero length.

Andreas.

--
Andreas Schwab, SuSE Labs, [email protected]
SuSE Linux Products GmbH, Maxfeldstra?e 5, 90409 N?rnberg, Germany
PGP key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."

2006-11-30 10:19:36

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix

On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote:
> Nick Piggin <[email protected]> writes:
>
> > writev with a zero-length segment is a noop, and we shouldn't return EFAULT.
>
> AFAICS the callers of these functions never pass a zero length.

They can in the case of a zero length write. I had considered also
doing this check in the caller, but I don't think it is too harmful
to make the API a little more robust? But if you have another
preference?

Thanks,
Nick

2006-11-30 10:30:37

by Andreas Schwab

[permalink] [raw]
Subject: Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix

Nick Piggin <[email protected]> writes:

> On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote:
>> Nick Piggin <[email protected]> writes:
>>
>> > writev with a zero-length segment is a noop, and we shouldn't return EFAULT.
>>
>> AFAICS the callers of these functions never pass a zero length.
>
> They can in the case of a zero length write.

How? All (indirect) callers I could find explicitly handle the
zero-length case.

Andreas.

--
Andreas Schwab, SuSE Labs, [email protected]
SuSE Linux Products GmbH, Maxfeldstra?e 5, 90409 N?rnberg, Germany
PGP key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."

2006-11-30 11:30:55

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 1/3] mm: pagecache write deadlocks zerolength fix

On Thu, Nov 30, 2006 at 11:30:33AM +0100, Andreas Schwab wrote:
> Nick Piggin <[email protected]> writes:
>
> > On Thu, Nov 30, 2006 at 11:15:39AM +0100, Andreas Schwab wrote:
> >> Nick Piggin <[email protected]> writes:
> >>
> >> > writev with a zero-length segment is a noop, and we shouldn't return EFAULT.
> >>
> >> AFAICS the callers of these functions never pass a zero length.
> >
> > They can in the case of a zero length write.
>
> How? All (indirect) callers I could find explicitly handle the
> zero-length case.

Sorry, zero length iov to writev (just had to double-check
there ;).

2006-11-30 11:32:46

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

Sorry, missed a hunk which prevented fat from linking :(

---

Rework the cont filesystem helpers so that generic_cont_expand does the
actual work of expanding the file. cont_prepare_write then calls this
routine if expanding is needed, and retries. Also solves the problem
where cont_prepare_write would previously hold the target page locked
while doing not-very-nice things like locking other pages.

Means that zero-length prepare/commit_write pairs are no longer needed
as an overloaded directive to extend the file, thus cont should operate
better within the new deadlock-free buffered write code.

Converts fat over to the new cont scheme.

Signed-off-by: Nick Piggin <[email protected]>


Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa
return 0;
}

-/* utility function for filesystems that need to do work on expanding
- * truncates. Uses prepare/commit_write to allow the filesystem to
- * deal with the hole.
+/*
+ * Utility function for filesystems that need to do work on expanding
+ * truncates. For moronic filesystems that do not allow holes in file.
*/
-static int __generic_cont_expand(struct inode *inode, loff_t size,
- pgoff_t index, unsigned int offset)
+int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes,
+ get_block_t *get_block)
{
struct address_space *mapping = inode->i_mapping;
+ unsigned long blocksize = 1 << inode->i_blkbits;
struct page *page;
unsigned long limit;
- int err;
+ int status;

- err = -EFBIG;
+ status = -EFBIG;
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && size > (loff_t)limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct
if (size > inode->i_sb->s_maxbytes)
goto out;

- err = -ENOMEM;
- page = grab_cache_page(mapping, index);
- if (!page)
- goto out;
- err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
- if (err) {
- /*
- * ->prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
- */
- unlock_page(page);
- page_cache_release(page);
- vmtruncate(inode, inode->i_size);
- goto out;
- }
+ status = 0;

- err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+ while (*bytes < size) {
+ unsigned int zerofrom;
+ unsigned int zeroto;
+ void *kaddr;
+ pgoff_t pgpos;
+
+ pgpos = *bytes >> PAGE_CACHE_SHIFT;
+ page = grab_cache_page(mapping, pgpos);
+ if (!page) {
+ status = -ENOMEM;
+ break;
+ }
+ /* we might sleep */
+ if (*bytes >> PAGE_CACHE_SHIFT != pgpos)
+ goto unlock;

- unlock_page(page);
- page_cache_release(page);
- if (err > 0)
- err = 0;
-out:
- return err;
-}
+ zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ if (zerofrom & (blocksize-1))
+ *bytes = (*bytes + blocksize-1) & (blocksize-1);

-int generic_cont_expand(struct inode *inode, loff_t size)
-{
- pgoff_t index;
- unsigned int offset;
+ zeroto = PAGE_CACHE_SIZE;

- offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
+ status = __block_prepare_write(inode, page, zerofrom,
+ zeroto, get_block);
+ if (status)
+ goto unlock;
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ status = __block_commit_write(inode, page, zerofrom, zeroto);

- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- /* caller must handle this extra byte. */
- offset++;
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ if (status) {
+ BUG_ON(status == AOP_TRUNCATED_PAGE);
+ break;
+ }
}
- index = size >> PAGE_CACHE_SHIFT;

- return __generic_cont_expand(inode, size, index, offset);
-}
-
-int generic_cont_expand_simple(struct inode *inode, loff_t size)
-{
- loff_t pos = size - 1;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ */
+ if (size > inode->i_size) {
+ i_size_write(inode, size);
+ mark_inode_dirty(inode);
+ }

- /* prepare/commit_write can handle even if from==to==start of block. */
- return __generic_cont_expand(inode, size, index, offset);
+out:
+ return status;
}

-/*
- * For moronic filesystems that do not allow holes in file.
- * We may have to extend the file.
- */
-
int cont_prepare_write(struct page *page, unsigned offset,
unsigned to, get_block_t *get_block, loff_t *bytes)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct page *new_page;
- pgoff_t pgpos;
- long status;
- unsigned zerofrom;
- unsigned blocksize = 1 << inode->i_blkbits;
- void *kaddr;
-
- while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
- status = -ENOMEM;
- new_page = grab_cache_page(mapping, pgpos);
- if (!new_page)
- goto out;
- /* we might sleep */
- if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
- unlock_page(new_page);
- page_cache_release(new_page);
- continue;
- }
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
- if (zerofrom & (blocksize-1)) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
- status = __block_prepare_write(inode, new_page, zerofrom,
- PAGE_CACHE_SIZE, get_block);
- if (status)
- goto out_unmap;
- kaddr = kmap_atomic(new_page, KM_USER0);
- memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
- flush_dcache_page(new_page);
- kunmap_atomic(kaddr, KM_USER0);
- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
- unlock_page(new_page);
- page_cache_release(new_page);
- }
-
- if (page->index < pgpos) {
- /* completely inside the area */
- zerofrom = offset;
- } else {
- /* page covers the boundary, find the boundary offset */
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
-
- /* if we will expand the thing last block will be filled */
- if (to > zerofrom && (zerofrom & (blocksize-1))) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
+ loff_t size = (page->index << PAGE_CACHE_SHIFT) + offset;
+ struct inode *inode = page->mapping->host;
+ int err;

- /* starting below the boundary? Nothing to zero out */
- if (offset <= zerofrom)
- zerofrom = offset;
- }
- status = __block_prepare_write(inode, page, zerofrom, to, get_block);
- if (status)
- goto out1;
- if (zerofrom < offset) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+zerofrom, 0, offset-zerofrom);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- __block_commit_write(inode, page, zerofrom, offset);
+ if (*bytes < size) {
+ unlock_page(page);
+ err = generic_cont_expand(inode, size, bytes, get_block);
+ if (!err)
+ err = AOP_TRUNCATED_PAGE;
+ else
+ lock_page(page); /* caller expects this */
+ return err;
}
- return 0;
-out1:
- ClearPageUptodate(page);
- return status;

-out_unmap:
- ClearPageUptodate(new_page);
- unlock_page(new_page);
- page_cache_release(new_page);
-out:
- return status;
+ err = __block_prepare_write(inode, page, offset, to, get_block);
+ if (err)
+ ClearPageUptodate(page);
+ return err;
}

int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -3015,7 +2953,6 @@ EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
-EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -202,8 +202,7 @@ int block_read_full_page(struct page*, g
int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
loff_t *);
-int generic_cont_expand(struct inode *inode, loff_t size);
-int generic_cont_expand_simple(struct inode *inode, loff_t size);
+int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes, get_block_t *);
int block_commit_write(struct page *page, unsigned from, unsigned to);
void block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
Index: linux-2.6/fs/fat/inode.c
===================================================================
--- linux-2.6.orig/fs/fat/inode.c
+++ linux-2.6/fs/fat/inode.c
@@ -117,6 +117,25 @@ static int fat_get_block(struct inode *i
return 0;
}

+int fat_cont_expand(struct inode *inode, loff_t size)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t start = inode->i_size, count = size - inode->i_size;
+ int err;
+
+ err = generic_cont_expand(inode, size,
+ &MSDOS_I(inode)->mmu_private, fat_get_block);
+ if (err)
+ goto out;
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ if (IS_SYNC(inode))
+ err = sync_page_range_nolock(inode, mapping, start, count);
+out:
+ return err;
+}
+
static int fat_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, fat_get_block, wbc);
@@ -151,11 +170,12 @@ static int fat_commit_write(struct file
{
struct inode *inode = page->mapping->host;
int err;
- if (to - from > 0)
- return 0;

err = generic_commit_write(file, page, from, to);
- if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
+ if (err)
+ return err;
+
+ if (!(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
mark_inode_dirty(inode);
Index: linux-2.6/fs/fat/file.c
===================================================================
--- linux-2.6.orig/fs/fat/file.c
+++ linux-2.6/fs/fat/file.c
@@ -137,24 +137,6 @@ const struct file_operations fat_file_op
.sendfile = generic_file_sendfile,
};

-static int fat_cont_expand(struct inode *inode, loff_t size)
-{
- struct address_space *mapping = inode->i_mapping;
- loff_t start = inode->i_size, count = size - inode->i_size;
- int err;
-
- err = generic_cont_expand_simple(inode, size);
- if (err)
- goto out;
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- if (IS_SYNC(inode))
- err = sync_page_range_nolock(inode, mapping, start, count);
-out:
- return err;
-}
-
int fat_notify_change(struct dentry *dentry, struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
Index: linux-2.6/include/linux/msdos_fs.h
===================================================================
--- linux-2.6.orig/include/linux/msdos_fs.h
+++ linux-2.6/include/linux/msdos_fs.h
@@ -406,6 +406,7 @@ extern int fat_getattr(struct vfsmount *
struct kstat *stat);

/* fat/inode.c */
+extern int fat_cont_expand(struct inode *inode, loff_t size);
extern void fat_attach(struct inode *inode, loff_t i_pos);
extern void fat_detach(struct inode *inode);
extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);

2006-11-30 22:14:40

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

Nick Piggin <[email protected]> writes:

> Rework the cont filesystem helpers so that generic_cont_expand does the
> actual work of expanding the file. cont_prepare_write then calls this
> routine if expanding is needed, and retries. Also solves the problem
> where cont_prepare_write would previously hold the target page locked
> while doing not-very-nice things like locking other pages.
>
> Means that zero-length prepare/commit_write pairs are no longer needed
> as an overloaded directive to extend the file, thus cont should operate
> better within the new deadlock-free buffered write code.
>
> Converts fat over to the new cont scheme.
>
> Signed-off-by: Nick Piggin <[email protected]>
>
>
> Index: linux-2.6/fs/buffer.c
> ===================================================================
> --- linux-2.6.orig/fs/buffer.c
> +++ linux-2.6/fs/buffer.c
> @@ -2004,19 +2004,20 @@ int block_read_full_page(struct page *pa
> return 0;
> }
>
> -/* utility function for filesystems that need to do work on expanding
> - * truncates. Uses prepare/commit_write to allow the filesystem to
> - * deal with the hole.
> +/*
> + * Utility function for filesystems that need to do work on expanding
> + * truncates. For moronic filesystems that do not allow holes in file.
> */
> -static int __generic_cont_expand(struct inode *inode, loff_t size,
> - pgoff_t index, unsigned int offset)
> +int generic_cont_expand(struct inode *inode, loff_t size, loff_t *bytes,
> + get_block_t *get_block)
> {
> struct address_space *mapping = inode->i_mapping;
> + unsigned long blocksize = 1 << inode->i_blkbits;
> struct page *page;
> unsigned long limit;
> - int err;
> + int status;
>
> - err = -EFBIG;
> + status = -EFBIG;
> limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
> if (limit != RLIM_INFINITY && size > (loff_t)limit) {
> send_sig(SIGXFSZ, current, 0);
> @@ -2025,146 +2026,83 @@ static int __generic_cont_expand(struct
> if (size > inode->i_sb->s_maxbytes)
> goto out;
>
> - err = -ENOMEM;
> - page = grab_cache_page(mapping, index);
> - if (!page)
> - goto out;
> - err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
> - if (err) {
> - /*
> - * ->prepare_write() may have instantiated a few blocks
> - * outside i_size. Trim these off again.
> - */
> - unlock_page(page);
> - page_cache_release(page);
> - vmtruncate(inode, inode->i_size);
> - goto out;
> - }
> + status = 0;
>
> - err = mapping->a_ops->commit_write(NULL, page, offset, offset);
> + while (*bytes < size) {
> + unsigned int zerofrom;
> + unsigned int zeroto;
> + void *kaddr;
> + pgoff_t pgpos;
> +
> + pgpos = *bytes >> PAGE_CACHE_SHIFT;
> + page = grab_cache_page(mapping, pgpos);
> + if (!page) {
> + status = -ENOMEM;
> + break;
> + }
> + /* we might sleep */
> + if (*bytes >> PAGE_CACHE_SHIFT != pgpos)
> + goto unlock;
>
> - unlock_page(page);
> - page_cache_release(page);
> - if (err > 0)
> - err = 0;
> -out:
> - return err;
> -}
> + zerofrom = *bytes & ~PAGE_CACHE_MASK;
> + if (zerofrom & (blocksize-1))
> + *bytes = (*bytes + blocksize-1) & (blocksize-1);
>
> -int generic_cont_expand(struct inode *inode, loff_t size)
> -{
> - pgoff_t index;
> - unsigned int offset;
> + zeroto = PAGE_CACHE_SIZE;
>
> - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
> + status = __block_prepare_write(inode, page, zerofrom,
> + zeroto, get_block);
> + if (status)
> + goto unlock;
> + kaddr = kmap_atomic(page, KM_USER0);
> + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
> + flush_dcache_page(page);
> + kunmap_atomic(kaddr, KM_USER0);
> + status = __block_commit_write(inode, page, zerofrom, zeroto);
>
> - /* ugh. in prepare/commit_write, if from==to==start of block, we
> - ** skip the prepare. make sure we never send an offset for the start
> - ** of a block
> - */
> - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
> - /* caller must handle this extra byte. */
> - offset++;
> +unlock:
> + unlock_page(page);
> + page_cache_release(page);
> + if (status) {
> + BUG_ON(status == AOP_TRUNCATED_PAGE);
> + break;
> + }
> }
> - index = size >> PAGE_CACHE_SHIFT;
>
> - return __generic_cont_expand(inode, size, index, offset);
> -}
> -
> -int generic_cont_expand_simple(struct inode *inode, loff_t size)
> -{
> - loff_t pos = size - 1;
> - pgoff_t index = pos >> PAGE_CACHE_SHIFT;
> - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
> + /*
> + * No need to use i_size_read() here, the i_size
> + * cannot change under us because we hold i_mutex.
> + */
> + if (size > inode->i_size) {
> + i_size_write(inode, size);
> + mark_inode_dirty(inode);
> + }
>
> - /* prepare/commit_write can handle even if from==to==start of block. */
> - return __generic_cont_expand(inode, size, index, offset);
> +out:
> + return status;
> }

quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using
it for another reason. I was also working for this, but I lost the
thread of this, sorry.

I found some another users (affs, hfs, hfsplus). Those seem have same
problem, but probably those also can use this...

What do you think?
--
OGAWA Hirofumi <[email protected]>



Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/buffer.c | 59 +++++++++++++++++---------------------------
fs/fat/file.c | 2 -
include/linux/buffer_head.h | 1
3 files changed, 24 insertions(+), 38 deletions(-)

diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c
--- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900
+++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900
@@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa
return 0;
}

-/* utility function for filesystems that need to do work on expanding
+/*
+ * utility function for filesystems that need to do work on expanding
* truncates. Uses prepare/commit_write to allow the filesystem to
* deal with the hole.
*/
-static int __generic_cont_expand(struct inode *inode, loff_t size,
- pgoff_t index, unsigned int offset)
+int generic_cont_expand(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
+ loff_t pos = inode->i_size;
struct page *page;
unsigned long limit;
+ pgoff_t index;
+ unsigned int from, to;
+ void *kaddr;
int err;

+ WARN_ON(pos >= size);
+
err = -EFBIG;
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && size > (loff_t)limit) {
@@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct
if (size > inode->i_sb->s_maxbytes)
goto out;

+ index = (size - 1) >> PAGE_CACHE_SHIFT;
+ to = size - ((loff_t)index << PAGE_CACHE_SHIFT);
+ if (index != (pos >> PAGE_CACHE_SHIFT))
+ from = 0;
+ else
+ from = pos & (PAGE_CACHE_SIZE - 1);
+
err = -ENOMEM;
page = grab_cache_page(mapping, index);
if (!page)
goto out;
- err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
+ err = mapping->a_ops->prepare_write(NULL, page, from, to);
if (err) {
/*
* ->prepare_write() may have instantiated a few blocks
@@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct
goto out;
}

- err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ err = mapping->a_ops->commit_write(NULL, page, from, to);

unlock_page(page);
page_cache_release(page);
@@ -2051,36 +2069,6 @@ out:
return err;
}

-int generic_cont_expand(struct inode *inode, loff_t size)
-{
- pgoff_t index;
- unsigned int offset;
-
- offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
-
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- /* caller must handle this extra byte. */
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
-
- return __generic_cont_expand(inode, size, index, offset);
-}
-
-int generic_cont_expand_simple(struct inode *inode, loff_t size)
-{
- loff_t pos = size - 1;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
-
- /* prepare/commit_write can handle even if from==to==start of block. */
- return __generic_cont_expand(inode, size, index, offset);
-}
-
/*
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
@@ -3032,7 +3020,6 @@ EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
-EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
diff -puN include/linux/buffer_head.h~generic_cont_expand-avoid-zero-size include/linux/buffer_head.h
--- linux-2.6/include/linux/buffer_head.h~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900
+++ linux-2.6-hirofumi/include/linux/buffer_head.h 2006-11-13 01:42:01.000000000 +0900
@@ -202,7 +202,6 @@ int block_prepare_write(struct page*, un
int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
loff_t *);
int generic_cont_expand(struct inode *inode, loff_t size);
-int generic_cont_expand_simple(struct inode *inode, loff_t size);
int block_commit_write(struct page *page, unsigned from, unsigned to);
void block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
diff -puN fs/fat/file.c~generic_cont_expand-avoid-zero-size fs/fat/file.c
--- linux-2.6/fs/fat/file.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/file.c 2006-11-13 01:42:01.000000000 +0900
@@ -143,7 +143,7 @@ static int fat_cont_expand(struct inode
loff_t start = inode->i_size, count = size - inode->i_size;
int err;

- err = generic_cont_expand_simple(inode, size);
+ err = generic_cont_expand(inode, size);
if (err)
goto out;

_

2006-12-01 00:27:53

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote:
>
> quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using
> it for another reason. I was also working for this, but I lost the
> thread of this, sorry.

Yes I think it will break reiserfs, so I just have to have a look
at converting it. Shouldn't take too long.

>
> I found some another users (affs, hfs, hfsplus). Those seem have same
> problem, but probably those also can use this...
>
> What do you think?

Well I guess this is your code, so it is up to you ;)

I would be happy if you come up with a quick fix, I'm just trying to
stamp out a few big bugs in mm. However I did prefer my way of moving
all the exapand code into generic_cont_expand, out of prepare_write, and
avoiding holding the target page locked while we're doing all the expand
work (strictly, you might be able to get away with this, but it is
fragile and ugly).

AFAIKS, the only reason to use prepare_write is to avoid passing the
get_block into generic_cont_expand?


Thanks,
Nick

2006-12-01 01:11:24

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

Nick Piggin <[email protected]> writes:

> I would be happy if you come up with a quick fix, I'm just trying to
> stamp out a few big bugs in mm. However I did prefer my way of moving
> all the exapand code into generic_cont_expand, out of prepare_write, and
> avoiding holding the target page locked while we're doing all the expand
> work (strictly, you might be able to get away with this, but it is
> fragile and ugly).
>
> AFAIKS, the only reason to use prepare_write is to avoid passing the
> get_block into generic_cont_expand?

IIRC, because generic_cont_expand is designed as really generic. It
can also use for non moronic filesystem.

In the case of reiserfs, it ->prepare_write might be necessary.
--
OGAWA Hirofumi <[email protected]>

2006-12-01 02:03:57

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, Dec 01, 2006 at 10:11:14AM +0900, OGAWA Hirofumi wrote:
> Nick Piggin <[email protected]> writes:
>
> > I would be happy if you come up with a quick fix, I'm just trying to
> > stamp out a few big bugs in mm. However I did prefer my way of moving
> > all the exapand code into generic_cont_expand, out of prepare_write, and
> > avoiding holding the target page locked while we're doing all the expand
> > work (strictly, you might be able to get away with this, but it is
> > fragile and ugly).
> >
> > AFAIKS, the only reason to use prepare_write is to avoid passing the
> > get_block into generic_cont_expand?
>
> IIRC, because generic_cont_expand is designed as really generic. It
> can also use for non moronic filesystem.
>
> In the case of reiserfs, it ->prepare_write might be necessary.

Yes I see :(

Well, maybe we should use your alternate patch, then.
I have a few questions on it.

2006-12-01 02:09:16

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote:
>
> quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using
> it for another reason. I was also working for this, but I lost the
> thread of this, sorry.
>
> I found some another users (affs, hfs, hfsplus). Those seem have same
> problem, but probably those also can use this...
>
> What do you think?
> --
> OGAWA Hirofumi <[email protected]>
>
>
>
> Signed-off-by: OGAWA Hirofumi <[email protected]>
> ---
>
> fs/buffer.c | 59 +++++++++++++++++---------------------------
> fs/fat/file.c | 2 -
> include/linux/buffer_head.h | 1
> 3 files changed, 24 insertions(+), 38 deletions(-)
>
> diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c
> --- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900
> +++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900
> @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa
> return 0;
> }
>
> -/* utility function for filesystems that need to do work on expanding
> +/*
> + * utility function for filesystems that need to do work on expanding
> * truncates. Uses prepare/commit_write to allow the filesystem to
> * deal with the hole.
> */
> -static int __generic_cont_expand(struct inode *inode, loff_t size,
> - pgoff_t index, unsigned int offset)
> +int generic_cont_expand(struct inode *inode, loff_t size)
> {
> struct address_space *mapping = inode->i_mapping;
> + loff_t pos = inode->i_size;
> struct page *page;
> unsigned long limit;
> + pgoff_t index;
> + unsigned int from, to;
> + void *kaddr;
> int err;
>
> + WARN_ON(pos >= size);
> +
> err = -EFBIG;
> limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
> if (limit != RLIM_INFINITY && size > (loff_t)limit) {
> @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct
> if (size > inode->i_sb->s_maxbytes)
> goto out;
>
> + index = (size - 1) >> PAGE_CACHE_SHIFT;
> + to = size - ((loff_t)index << PAGE_CACHE_SHIFT);
> + if (index != (pos >> PAGE_CACHE_SHIFT))
> + from = 0;
> + else
> + from = pos & (PAGE_CACHE_SIZE - 1);
> +
> err = -ENOMEM;
> page = grab_cache_page(mapping, index);
> if (!page)
> goto out;
> - err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
> + err = mapping->a_ops->prepare_write(NULL, page, from, to);
> if (err) {
> /*
> * ->prepare_write() may have instantiated a few blocks
> @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct
> goto out;
> }
>
> - err = mapping->a_ops->commit_write(NULL, page, offset, offset);
> + kaddr = kmap_atomic(page, KM_USER0);
> + memset(kaddr + from, 0, to - from);
> + flush_dcache_page(page);
> + kunmap_atomic(kaddr, KM_USER0);
> +
> + err = mapping->a_ops->commit_write(NULL, page, from, to);

So basically this is changing from having prepare_write do all the
zeroing, to zeroing the last page in generic_cont_expand, so that
we don't have to pass a zero-length to prepare_write?

2006-12-01 03:41:33

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

Nick Piggin <[email protected]> writes:

> On Fri, Dec 01, 2006 at 07:14:28AM +0900, OGAWA Hirofumi wrote:
>>
>> quick look. Doesn't this break reiserfs? IIRC, the reiserfs is using
>> it for another reason. I was also working for this, but I lost the
>> thread of this, sorry.
>>
>> I found some another users (affs, hfs, hfsplus). Those seem have same
>> problem, but probably those also can use this...
>>
>> What do you think?
>> --
>> OGAWA Hirofumi <[email protected]>
>>
>>
>>
>> Signed-off-by: OGAWA Hirofumi <[email protected]>
>> ---
>>
>> fs/buffer.c | 59 +++++++++++++++++---------------------------
>> fs/fat/file.c | 2 -
>> include/linux/buffer_head.h | 1
>> 3 files changed, 24 insertions(+), 38 deletions(-)
>>
>> diff -puN fs/buffer.c~generic_cont_expand-avoid-zero-size fs/buffer.c
>> --- linux-2.6/fs/buffer.c~generic_cont_expand-avoid-zero-size 2006-11-13 01:42:01.000000000 +0900
>> +++ linux-2.6-hirofumi/fs/buffer.c 2006-11-13 02:16:20.000000000 +0900
>> @@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa
>> return 0;
>> }
>>
>> -/* utility function for filesystems that need to do work on expanding
>> +/*
>> + * utility function for filesystems that need to do work on expanding
>> * truncates. Uses prepare/commit_write to allow the filesystem to
>> * deal with the hole.
>> */
>> -static int __generic_cont_expand(struct inode *inode, loff_t size,
>> - pgoff_t index, unsigned int offset)
>> +int generic_cont_expand(struct inode *inode, loff_t size)
>> {
>> struct address_space *mapping = inode->i_mapping;
>> + loff_t pos = inode->i_size;
>> struct page *page;
>> unsigned long limit;
>> + pgoff_t index;
>> + unsigned int from, to;
>> + void *kaddr;
>> int err;
>>
>> + WARN_ON(pos >= size);
>> +
>> err = -EFBIG;
>> limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
>> if (limit != RLIM_INFINITY && size > (loff_t)limit) {
>> @@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct
>> if (size > inode->i_sb->s_maxbytes)
>> goto out;
>>
>> + index = (size - 1) >> PAGE_CACHE_SHIFT;
>> + to = size - ((loff_t)index << PAGE_CACHE_SHIFT);
>> + if (index != (pos >> PAGE_CACHE_SHIFT))
>> + from = 0;
>> + else
>> + from = pos & (PAGE_CACHE_SIZE - 1);
>> +
>> err = -ENOMEM;
>> page = grab_cache_page(mapping, index);
>> if (!page)
>> goto out;
>> - err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
>> + err = mapping->a_ops->prepare_write(NULL, page, from, to);
>> if (err) {
>> /*
>> * ->prepare_write() may have instantiated a few blocks
>> @@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct
>> goto out;
>> }
>>
>> - err = mapping->a_ops->commit_write(NULL, page, offset, offset);
>> + kaddr = kmap_atomic(page, KM_USER0);
>> + memset(kaddr + from, 0, to - from);
>> + flush_dcache_page(page);
>> + kunmap_atomic(kaddr, KM_USER0);
>> +
>> + err = mapping->a_ops->commit_write(NULL, page, from, to);
>
> So basically this is changing from having prepare_write do all the
> zeroing, to zeroing the last page in generic_cont_expand, so that
> we don't have to pass a zero-length to prepare_write?

Yes, this patch doesn't pass zero-length to prepare_write. However,
I'm not checking this patch is ok for reiserfs...
--
OGAWA Hirofumi <[email protected]>

2006-12-01 03:47:29

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote:
> Nick Piggin <[email protected]> writes:
> >
> > So basically this is changing from having prepare_write do all the
> > zeroing, to zeroing the last page in generic_cont_expand, so that
> > we don't have to pass a zero-length to prepare_write?
>
> Yes, this patch doesn't pass zero-length to prepare_write. However,
> I'm not checking this patch is ok for reiserfs...

OK, I'm doing some testing with it now...

2006-12-01 05:08:55

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote:
>
> Yes, this patch doesn't pass zero-length to prepare_write. However,
> I'm not checking this patch is ok for reiserfs...

OK, vfat wasn't working correctly for me -- I needed the following patch:

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100
+++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100
@@ -2102,6 +2102,7 @@
*bytes |= (blocksize-1);
(*bytes)++;
}
+
status = __block_prepare_write(inode, new_page, zerofrom,
PAGE_CACHE_SIZE, get_block);
if (status)
@@ -2110,7 +2111,7 @@
memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
flush_dcache_page(new_page);
kunmap_atomic(kaddr, KM_USER0);
- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
+ __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
unlock_page(new_page);
page_cache_release(new_page);
}
@@ -2142,6 +2143,7 @@
kunmap_atomic(kaddr, KM_USER0);
__block_commit_write(inode, page, zerofrom, offset);
}
+
return 0;
out1:
ClearPageUptodate(page);
Index: linux-2.6/fs/fat/inode.c
===================================================================
--- linux-2.6.orig/fs/fat/inode.c 2006-12-01 15:31:22.000000000 +1100
+++ linux-2.6/fs/fat/inode.c 2006-12-01 16:02:08.000000000 +1100
@@ -151,7 +151,8 @@
{
struct inode *inode = page->mapping->host;
int err;
- if (to - from > 0)
+
+ if (to - from == 0)
return 0;

err = generic_commit_write(file, page, from, to);

And reiserfs has some creative uses of generic_cont_expand, which I
think I've fixed in the following way. I'll send out the full patch
if you (and others) agree with the changes.


Index: linux-2.6/fs/reiserfs/file.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/file.c 2006-12-01 16:01:17.000000000 +1100
+++ linux-2.6/fs/reiserfs/file.c 2006-12-01 16:03:34.000000000 +1100
@@ -892,6 +892,34 @@
return retval;
}

+static int reiserfs_convert_tail(struct inode *inode, loff_t offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct page *page;
+ int err = -ENOMEM;
+
+ index = offset >> PAGE_CACHE_SHIFT;
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ goto out;
+
+ offset = offset & ~PAGE_CACHE_MASK;
+ WARN_ON(!offset); /* shouldn't pass zero length to prepare/commit */
+
+ err = mapping->a_ops->prepare_write(NULL, page, 0, offset);
+ if (err)
+ goto out_unlock;
+
+ err = mapping->a_ops->commit_write(NULL, page, 0, offset);
+
+out_unlock:
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return err;
+}
+
/* Look if passed writing region is going to touch file's tail
(if it is present). And if it is, convert the tail to unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode, /* inode to deal with */
@@ -937,7 +965,8 @@
le_key_k_offset(get_inode_item_key_version(inode),
&(ih->ih_key));
pathrelse(&path);
- res = generic_cont_expand(inode, cont_expand_offset);
+
+ res = reiserfs_convert_tail(inode, cont_expand_offset);
} else
pathrelse(&path);

2006-12-01 07:21:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Fri, 1 Dec 2006 06:08:52 +0100
Nick Piggin <[email protected]> wrote:

> On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote:
> >
> > Yes, this patch doesn't pass zero-length to prepare_write. However,
> > I'm not checking this patch is ok for reiserfs...
>
> OK, vfat wasn't working correctly for me -- I needed the following patch:

Now I'm confused. What relationship does this patch have to the below?

revert-generic_file_buffered_write-handle-zero-length-iovec-segments.patch
revert-generic_file_buffered_write-deadlock-on-vectored-write.patch
generic_file_buffered_write-cleanup.patch
mm-only-mm-debug-write-deadlocks.patch
mm-fix-pagecache-write-deadlocks.patch
mm-fix-pagecache-write-deadlocks-comment.patch
mm-fix-pagecache-write-deadlocks-xip.patch
mm-fix-pagecache-write-deadlocks-mm-pagecache-write-deadlocks-efault-fix.patch
mm-fix-pagecache-write-deadlocks-zerolength-fix.patch
mm-fix-pagecache-write-deadlocks-stale-holes-fix.patch
fs-prepare_write-fixes.patch
fs-prepare_write-fixes-fuse-fix.patch
fs-prepare_write-fixes-jffs-fix.patch
fs-prepare_write-fixes-fat-fix.patch
fs-fix-cont-vs-deadlock-patches.patch


> Index: linux-2.6/fs/buffer.c
> ===================================================================
> --- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100
> +++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100
> @@ -2102,6 +2102,7 @@

Please always use `diff -p'

> *bytes |= (blocksize-1);
> (*bytes)++;
> }
> +
> status = __block_prepare_write(inode, new_page, zerofrom,
> PAGE_CACHE_SIZE, get_block);
> if (status)
> @@ -2110,7 +2111,7 @@
> memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
> flush_dcache_page(new_page);
> kunmap_atomic(kaddr, KM_USER0);
> - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
> + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);

Whatever function this is doesn't need to update i_size?


2006-12-01 07:53:45

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

On Thu, Nov 30, 2006 at 11:21:02PM -0800, Andrew Morton wrote:
> On Fri, 1 Dec 2006 06:08:52 +0100
> Nick Piggin <[email protected]> wrote:
>
> > On Fri, Dec 01, 2006 at 12:41:25PM +0900, OGAWA Hirofumi wrote:
> > >
> > > Yes, this patch doesn't pass zero-length to prepare_write. However,
> > > I'm not checking this patch is ok for reiserfs...
> >
> > OK, vfat wasn't working correctly for me -- I needed the following patch:
>
> Now I'm confused. What relationship does this patch have to the below?

It is on top of OGAWA Hirofumi's patch, which itself is a replacement
for fs-fix-cont-vs-deadlock-patches.patch. I'll ensure you get the
right one.

> revert-generic_file_buffered_write-handle-zero-length-iovec-segments.patch
> revert-generic_file_buffered_write-deadlock-on-vectored-write.patch
> generic_file_buffered_write-cleanup.patch
> mm-only-mm-debug-write-deadlocks.patch
> mm-fix-pagecache-write-deadlocks.patch
> mm-fix-pagecache-write-deadlocks-comment.patch
> mm-fix-pagecache-write-deadlocks-xip.patch
> mm-fix-pagecache-write-deadlocks-mm-pagecache-write-deadlocks-efault-fix.patch
> mm-fix-pagecache-write-deadlocks-zerolength-fix.patch
> mm-fix-pagecache-write-deadlocks-stale-holes-fix.patch
> fs-prepare_write-fixes.patch
> fs-prepare_write-fixes-fuse-fix.patch
> fs-prepare_write-fixes-jffs-fix.patch
> fs-prepare_write-fixes-fat-fix.patch
> fs-fix-cont-vs-deadlock-patches.patch
>
>
> > Index: linux-2.6/fs/buffer.c
> > ===================================================================
> > --- linux-2.6.orig/fs/buffer.c 2006-12-01 15:31:22.000000000 +1100
> > +++ linux-2.6/fs/buffer.c 2006-12-01 16:02:23.000000000 +1100
> > @@ -2102,6 +2102,7 @@
>
> Please always use `diff -p'

Dang, yes its much better.

> > *bytes |= (blocksize-1);
> > (*bytes)++;
> > }
> > +
> > status = __block_prepare_write(inode, new_page, zerofrom,
> > PAGE_CACHE_SIZE, get_block);
> > if (status)
> > @@ -2110,7 +2111,7 @@
> > memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
> > flush_dcache_page(new_page);
> > kunmap_atomic(kaddr, KM_USER0);
> > - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
> > + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
>
> Whatever function this is doesn't need to update i_size?

Yes, it is the code in cont_prepare_write that is expanding a hole
at the end of file.

We can do this now because fat_commit_write is now changed to call
generic_commit_write in the case of a non-zero length.

I think it is an improvement because now the file will not get
arbitrarily extended in the case of a write failure somewhere down
the track.

2006-12-01 14:51:15

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

Nick Piggin <[email protected]> writes:

>> > status = __block_prepare_write(inode, new_page, zerofrom,
>> > PAGE_CACHE_SIZE, get_block);
>> > if (status)
>> > @@ -2110,7 +2111,7 @@
>> > memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
>> > flush_dcache_page(new_page);
>> > kunmap_atomic(kaddr, KM_USER0);
>> > - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
>> > + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
>>
>> Whatever function this is doesn't need to update i_size?
>
> Yes, it is the code in cont_prepare_write that is expanding a hole
> at the end of file.
>
> We can do this now because fat_commit_write is now changed to call
> generic_commit_write in the case of a non-zero length.
>
> I think it is an improvement because now the file will not get
> arbitrarily extended in the case of a write failure somewhere down
> the track.

Ah, unfortunately we can't this. If we don't update ->i_size before
page_cache_release, pdflush will think these pages is outside ->i_size
and just clean the page without writing it.
--
OGAWA Hirofumi <[email protected]>

2006-12-01 15:47:32

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

OGAWA Hirofumi <[email protected]> writes:

> Ah, unfortunately we can't this. If we don't update ->i_size before
> page_cache_release, pdflush will think these pages is outside ->i_size
> and just clean the page without writing it.

Ugh, of course, s/page_cache_release/unlock_page/
--
OGAWA Hirofumi <[email protected]>

2006-12-02 00:37:28

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 3/3] fs: fix cont vs deadlock patches

OGAWA Hirofumi wrote:
> Nick Piggin <[email protected]> writes:
>
>
>>>> status = __block_prepare_write(inode, new_page, zerofrom,
>>>> PAGE_CACHE_SIZE, get_block);
>>>> if (status)
>>>>@@ -2110,7 +2111,7 @@
>>>> memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
>>>> flush_dcache_page(new_page);
>>>> kunmap_atomic(kaddr, KM_USER0);
>>>>- generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
>>>>+ __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
>>>
>>>Whatever function this is doesn't need to update i_size?
>>
>>Yes, it is the code in cont_prepare_write that is expanding a hole
>>at the end of file.
>>
>>We can do this now because fat_commit_write is now changed to call
>>generic_commit_write in the case of a non-zero length.
>>
>>I think it is an improvement because now the file will not get
>>arbitrarily extended in the case of a write failure somewhere down
>>the track.
>
>
> Ah, unfortunately we can't this. If we don't update ->i_size before
> page_cache_release, pdflush will think these pages is outside ->i_size
> and just clean the page without writing it.

I see. I guess you need to synchronise your writepage versus this
extention in order to handle it properly then. I won't bother with
that though: it won't be worse than it was before.

Thanks for review, do you agree with the other hunks?

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-12-02 07:28:37

by Nick Piggin

[permalink] [raw]
Subject: [new patch 3/3] fs: fix cont vs deadlock patches

> I see. I guess you need to synchronise your writepage versus this
> extention in order to handle it properly then. I won't bother with
> that though: it won't be worse than it was before.
>
> Thanks for review, do you agree with the other hunks?

Well, Andrew's got the rest of the patches in his tree, so I'll send
what we've got for now. Has had some testing on both reiserfs and
fat. Doesn't look like the other filesystems using cont_prepare_write
will have any problems...

Andrew, please apply this patch as a replacement for the fat-fix
patch in your rollup (this patch includes the same fix, and is a
more logical change unit I think).
--

Stop overloading zero length prepare/commit_write to request a
file extend operation by the "cont" buffer code. Instead, have
generic_cont_expand perform a zeroing operation on the last
page, and cont_prepare_write naturally zeroes any previous
pages required.

Signed-off-by: OGAWA Hirofumi <[email protected]>

Reiserfs was trying to "extend" a file to something smaller than
it already is with generic_cont_expand. This broke the above fix.
Open code the prepare/ commit pair instead... maybe the code would
be cleaner if reiserfs just did the operation explicitly (call
get_block or whatever helper is used to unpack the tail)?

Signed-off-by: Nick Piggin <[email protected]>


Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -2004,18 +2004,24 @@ int block_read_full_page(struct page *pa
return 0;
}

-/* utility function for filesystems that need to do work on expanding
+/*
+ * utility function for filesystems that need to do work on expanding
* truncates. Uses prepare/commit_write to allow the filesystem to
* deal with the hole.
*/
-static int __generic_cont_expand(struct inode *inode, loff_t size,
- pgoff_t index, unsigned int offset)
+int generic_cont_expand(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
+ loff_t pos = inode->i_size;
struct page *page;
unsigned long limit;
+ pgoff_t index;
+ unsigned int from, to;
+ void *kaddr;
int err;

+ WARN_ON(pos >= size);
+
err = -EFBIG;
limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && size > (loff_t)limit) {
@@ -2025,11 +2031,18 @@ static int __generic_cont_expand(struct
if (size > inode->i_sb->s_maxbytes)
goto out;

+ index = (size - 1) >> PAGE_CACHE_SHIFT;
+ to = size - ((loff_t)index << PAGE_CACHE_SHIFT);
+ if (index != (pos >> PAGE_CACHE_SHIFT))
+ from = 0;
+ else
+ from = pos & (PAGE_CACHE_SIZE - 1);
+
err = -ENOMEM;
page = grab_cache_page(mapping, index);
if (!page)
goto out;
- err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
+ err = mapping->a_ops->prepare_write(NULL, page, from, to);
if (err) {
/*
* ->prepare_write() may have instantiated a few blocks
@@ -2041,7 +2054,12 @@ static int __generic_cont_expand(struct
goto out;
}

- err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ err = mapping->a_ops->commit_write(NULL, page, from, to);

unlock_page(page);
page_cache_release(page);
@@ -2051,36 +2069,6 @@ out:
return err;
}

-int generic_cont_expand(struct inode *inode, loff_t size)
-{
- pgoff_t index;
- unsigned int offset;
-
- offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
-
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- /* caller must handle this extra byte. */
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
-
- return __generic_cont_expand(inode, size, index, offset);
-}
-
-int generic_cont_expand_simple(struct inode *inode, loff_t size)
-{
- loff_t pos = size - 1;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
-
- /* prepare/commit_write can handle even if from==to==start of block. */
- return __generic_cont_expand(inode, size, index, offset);
-}
-
/*
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
@@ -3015,7 +3003,6 @@ EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
-EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -203,7 +203,6 @@ int block_prepare_write(struct page*, un
int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
loff_t *);
int generic_cont_expand(struct inode *inode, loff_t size);
-int generic_cont_expand_simple(struct inode *inode, loff_t size);
int block_commit_write(struct page *page, unsigned from, unsigned to);
void block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
Index: linux-2.6/fs/fat/file.c
===================================================================
--- linux-2.6.orig/fs/fat/file.c
+++ linux-2.6/fs/fat/file.c
@@ -143,7 +143,7 @@ static int fat_cont_expand(struct inode
loff_t start = inode->i_size, count = size - inode->i_size;
int err;

- err = generic_cont_expand_simple(inode, size);
+ err = generic_cont_expand(inode, size);
if (err)
goto out;

Index: linux-2.6/fs/fat/inode.c
===================================================================
--- linux-2.6.orig/fs/fat/inode.c
+++ linux-2.6/fs/fat/inode.c
@@ -151,7 +151,8 @@ static int fat_commit_write(struct file
{
struct inode *inode = page->mapping->host;
int err;
- if (to - from > 0)
+
+ if (to - from == 0)
return 0;

err = generic_commit_write(file, page, from, to);
Index: linux-2.6/fs/reiserfs/file.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/file.c
+++ linux-2.6/fs/reiserfs/file.c
@@ -892,6 +892,37 @@ static int reiserfs_submit_file_region_f
return retval;
}

+/* To not overcomplicate matters, we just call prepare/commit_wrie
+ which will in turn call other stuff and finally will boil down to
+ reiserfs_get_block() that would do necessary conversion. */
+static int reiserfs_convert_tail(struct inode *inode, loff_t offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct page *page;
+ int err = -ENOMEM;
+
+ index = offset >> PAGE_CACHE_SHIFT;
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ goto out;
+
+ offset = offset & ~PAGE_CACHE_MASK;
+ WARN_ON(!offset); /* shouldn't pass zero length to prepare/commit */
+
+ err = mapping->a_ops->prepare_write(NULL, page, 0, offset);
+ if (err)
+ goto out_unlock;
+
+ err = mapping->a_ops->commit_write(NULL, page, 0, offset);
+
+out_unlock:
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return err;
+}
+
/* Look if passed writing region is going to touch file's tail
(if it is present). And if it is, convert the tail to unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode, /* inode to deal with */
@@ -930,14 +961,12 @@ static int reiserfs_check_for_tail_and_c
if (is_direct_le_ih(ih)) {
/* Ok, closest item is file tail (tails are stored in "direct"
* items), so we need to unpack it. */
- /* To not overcomplicate matters, we just call generic_cont_expand
- which will in turn call other stuff and finally will boil down to
- reiserfs_get_block() that would do necessary conversion. */
cont_expand_offset =
le_key_k_offset(get_inode_item_key_version(inode),
&(ih->ih_key));
pathrelse(&path);
- res = generic_cont_expand(inode, cont_expand_offset);
+
+ res = reiserfs_convert_tail(inode, cont_expand_offset);
} else
pathrelse(&path);

2006-12-02 09:43:17

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [new patch 3/3] fs: fix cont vs deadlock patches

Nick Piggin <[email protected]> writes:

>> I see. I guess you need to synchronise your writepage versus this
>> extention in order to handle it properly then. I won't bother with
>> that though: it won't be worse than it was before.
>>
>> Thanks for review, do you agree with the other hunks?
>
> Well, Andrew's got the rest of the patches in his tree, so I'll send
> what we've got for now. Has had some testing on both reiserfs and
> fat. Doesn't look like the other filesystems using cont_prepare_write
> will have any problems...
>
> Andrew, please apply this patch as a replacement for the fat-fix
> patch in your rollup (this patch includes the same fix, and is a
> more logical change unit I think).

I'm confused. I couldn't track what is final patchset. Anyway, I'll
see and test your final patchset in -mm.

Thanks.
--
OGAWA Hirofumi <[email protected]>