Hi Linus,
The following patch (against clean test13-pre4) removes the races in
shmem_unuse. I changed inode.c to not lock the inode if there is no
write_inode function. So I can grab the inode while holding the
spinlock.
It also optimises the shmem_ftruncate behaviour.
BTW: The generic swapoff path itself still has races if a process is
paging in a page which has just been freed on swap by try_to_unuse. It
gives 'VM: bad swap entries' and worse. But this is not shmem
specific. Marcelo, would you like to look into this?
Greetings
Christoph
--- 4-13-4/fs/inode.c Sun Dec 17 12:54:00 2000
+++ c/fs/inode.c Wed Dec 27 11:02:59 2000
@@ -168,13 +168,6 @@
__wait_on_inode(inode);
}
-
-static inline void write_inode(struct inode *inode, int sync)
-{
- if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode)
- inode->i_sb->s_op->write_inode(inode, sync);
-}
-
static inline void __iget(struct inode * inode)
{
if (atomic_read(&inode->i_count)) {
@@ -202,16 +195,20 @@
list_add(&inode->i_list, atomic_read(&inode->i_count)
? &inode_in_use
: &inode_unused);
- /* Set I_LOCK, reset I_DIRTY */
- inode->i_state |= I_LOCK;
- inode->i_state &= ~I_DIRTY;
- spin_unlock(&inode_lock);
+ if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode) {
+ /* Set I_LOCK, reset I_DIRTY */
+ inode->i_state |= I_LOCK;
+ inode->i_state &= ~I_DIRTY;
+ spin_unlock(&inode_lock);
- write_inode(inode, sync);
+ inode->i_sb->s_op->write_inode(inode, sync);
- spin_lock(&inode_lock);
- inode->i_state &= ~I_LOCK;
- wake_up(&inode->i_wait);
+ spin_lock(&inode_lock);
+ inode->i_state &= ~I_LOCK;
+ wake_up(&inode->i_wait);
+ return;
+ }
+ inode->i_state &= ~I_DIRTY;
}
}
diff -uNr 4-13-4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h
--- 4-13-4/include/linux/shmem_fs.h Fri Dec 22 10:05:38 2000
+++ c/include/linux/shmem_fs.h Tue Dec 26 18:52:29 2000
@@ -19,6 +19,7 @@
struct shmem_inode_info {
spinlock_t lock;
+ unsigned long max_index;
swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
swp_entry_t **i_indirect; /* doubly indirect blocks */
unsigned long swapped;
diff -uNr 4-13-4/mm/shmem.c c/mm/shmem.c
--- 4-13-4/mm/shmem.c Fri Dec 22 10:05:38 2000
+++ c/mm/shmem.c Tue Dec 26 20:00:32 2000
@@ -51,11 +51,16 @@
static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index)
{
+ unsigned long offset;
+
if (index < SHMEM_NR_DIRECT)
return info->i_direct+index;
index -= SHMEM_NR_DIRECT;
- if (index >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
+ offset = index % ENTRIES_PER_PAGE;
+ index /= ENTRIES_PER_PAGE;
+
+ if (index >= ENTRIES_PER_PAGE)
return NULL;
if (!info->i_indirect) {
@@ -63,13 +68,13 @@
if (!info->i_indirect)
return NULL;
}
- if(!(info->i_indirect[index/ENTRIES_PER_PAGE])) {
- info->i_indirect[index/ENTRIES_PER_PAGE] = (swp_entry_t *) get_zeroed_page(GFP_USER);
- if (!info->i_indirect[index/ENTRIES_PER_PAGE])
+ if(!(info->i_indirect[index])) {
+ info->i_indirect[index] = (swp_entry_t *) get_zeroed_page(GFP_USER);
+ if (!info->i_indirect[index])
return NULL;
}
- return info->i_indirect[index/ENTRIES_PER_PAGE]+index%ENTRIES_PER_PAGE;
+ return info->i_indirect[index]+offset;
}
static int shmem_free_swp(swp_entry_t *dir, unsigned int count)
@@ -99,7 +104,6 @@
* @dir: pointer to swp_entries
* @size: number of entries in dir
* @start: offset to start from
- * @inode: inode for statistics
* @freed: counter for freed pages
*
* It frees the swap entries from dir+start til dir+size
@@ -109,7 +113,7 @@
static unsigned long
shmem_truncate_part (swp_entry_t * dir, unsigned long size,
- unsigned long start, struct inode * inode, unsigned long *freed) {
+ unsigned long start, unsigned long *freed) {
if (start > size)
return start - size;
if (dir)
@@ -121,21 +125,27 @@
static void shmem_truncate (struct inode * inode)
{
int clear_base;
- unsigned long start;
+ unsigned long index, start;
unsigned long mmfreed, freed = 0;
- swp_entry_t **base, **ptr;
+ swp_entry_t **base, **ptr, **last;
struct shmem_inode_info * info = &inode->u.shmem_i;
spin_lock (&info->lock);
- start = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (index >= info->max_index) {
+ info->max_index = index;
+ spin_unlock (&info->lock);
+ return;
+ }
- start = shmem_truncate_part (info->i_direct, SHMEM_NR_DIRECT, start, inode, &freed);
+ start = shmem_truncate_part (info->i_direct, SHMEM_NR_DIRECT, index, &freed);
if (!(base = info->i_indirect))
- goto out;;
+ goto out;
clear_base = 1;
- for (ptr = base; ptr < base + ENTRIES_PER_PAGE; ptr++) {
+ last = base + ((info->max_index - SHMEM_NR_DIRECT + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE);
+ for (ptr = base; ptr < last; ptr++) {
if (!start) {
if (!*ptr)
continue;
@@ -145,16 +155,16 @@
continue;
}
clear_base = 0;
- start = shmem_truncate_part (*ptr, ENTRIES_PER_PAGE, start, inode, &freed);
+ start = shmem_truncate_part (*ptr, ENTRIES_PER_PAGE, start, &freed);
}
- if (!clear_base)
- goto out;
-
- free_page ((unsigned long)base);
- info->i_indirect = 0;
+ if (clear_base) {
+ free_page ((unsigned long)base);
+ info->i_indirect = 0;
+ }
out:
+ info->max_index = index;
/*
* We have to calculate the free blocks since we do not know
@@ -181,14 +191,14 @@
{
struct shmem_sb_info *info = &inode->i_sb->u.shmem_sb;
- spin_lock (&shmem_ilock);
- list_del (&inode->u.shmem_i.list);
- spin_unlock (&shmem_ilock);
inode->i_size = 0;
shmem_truncate (inode);
spin_lock (&info->stat_lock);
info->free_inodes++;
spin_unlock (&info->stat_lock);
+ spin_lock (&shmem_ilock);
+ list_del (&inode->u.shmem_i.list);
+ spin_unlock (&shmem_ilock);
clear_inode(inode);
}
@@ -215,21 +225,23 @@
info = &((struct inode *)page->mapping->host)->u.shmem_i;
if (info->locked)
return 1;
+ spin_lock(&info->lock);
swap = __get_swap_page(2);
- if (!swap.val)
+ if (!swap.val) {
+ spin_unlock(&info->lock);
return 1;
+ }
- spin_lock(&info->lock);
- entry = shmem_swp_entry (info, page->index);
+ entry = shmem_swp_entry(info, page->index);
if (!entry) /* this had been allocted on page allocation */
BUG();
error = -EAGAIN;
if (entry->val) {
- __swap_free(swap, 2);
+ __swap_free(swap, 2);
goto out;
- }
+ }
- *entry = swap;
+ *entry = swap;
error = 0;
/* Remove the from the page cache */
lru_cache_del(page);
@@ -756,21 +768,21 @@
return -1;
}
-static int shmem_unuse_inode (struct inode *inode, swp_entry_t entry, struct page *page)
+static inline int shmem_unuse_inode (struct shmem_inode_info *info, struct address_space *mapping, swp_entry_t entry, struct page *page)
{
swp_entry_t **base, **ptr;
unsigned long idx;
int offset;
idx = 0;
- if ((offset = shmem_clear_swp (entry, inode->u.shmem_i.i_direct, SHMEM_NR_DIRECT)) >= 0)
+ if ((offset = shmem_clear_swp (entry,info->i_direct, SHMEM_NR_DIRECT)) >= 0)
goto found;
idx = SHMEM_NR_DIRECT;
- if (!(base = inode->u.shmem_i.i_indirect))
+ if (!(base = info->i_indirect))
return 0;
- for (ptr = base; ptr < base + ENTRIES_PER_PAGE; ptr++) {
+ for (ptr = base; idx <= info->max_index; ptr++) {
if (*ptr &&
(offset = shmem_clear_swp (entry, *ptr, ENTRIES_PER_PAGE)) >= 0)
goto found;
@@ -778,35 +790,71 @@
}
return 0;
found:
- delete_from_swap_cache (page);
- add_to_page_cache (page, inode->i_mapping, idx);
+ if (PageSwapCache(page))
+ BUG();
+ add_to_page_cache (page, mapping, offset + idx);
SetPageDirty (page);
SetPageUptodate (page);
UnlockPage (page);
- spin_lock (&inode->u.shmem_i.lock);
- inode->u.shmem_i.swapped--;
- spin_unlock (&inode->u.shmem_i.lock);
+ info->swapped--;
return 1;
}
+static struct inode * grab_next_inode(struct list_head **listp)
+{
+ struct inode *inode;
+
+ for (*listp = (*listp)->next; *listp != &shmem_inodes; *listp = (*listp)->next) {
+ inode = list_entry(*listp, struct inode, u.shmem_i.list);
+
+ if (igrab(inode))
+ return inode;
+ }
+
+ return NULL;
+}
+
/*
- * unuse_shmem() search for an eventually swapped out shmem page.
+ * shmem_unuse() search for an eventually swapped out shmem page.
*/
void shmem_unuse(swp_entry_t entry, struct page *page)
{
- struct list_head *p;
- struct inode * inode;
+ struct list_head *p = &shmem_inodes;
+ struct inode * inode, *next_inode = NULL;
+ struct shmem_inode_info *info;
+ int ret;
- spin_lock (&shmem_ilock);
- list_for_each(p, &shmem_inodes) {
- inode = list_entry(p, struct inode, u.shmem_i.list);
+ /*
+ * This is tricky:
+ *
+ * - We cannot use a semaphore for the list since delete will
+ * revert the lock order.
+ * - We need to get the inode semaphore since the swap cache
+ * has no atomic actions for getting an entry
+ * - We cannot drop the inode while holding the lock
+ *
+ * So we grab the current inode and the next one to make sure
+ * that the next pointer in the list is valid.
+ */
- if (shmem_unuse_inode(inode, entry, page))
+ spin_lock(&shmem_ilock);
+ for(inode = grab_next_inode(&p); inode; inode = next_inode) {
+ next_inode = grab_next_inode(&p);
+ info = &inode->u.shmem_i;
+ spin_unlock(&shmem_ilock);
+ down(&inode->i_sem);
+ spin_lock (&info->lock);
+ ret = shmem_unuse_inode(info, inode->i_mapping, entry, page);
+ spin_unlock (&info->lock);
+ up(&inode->i_sem);
+ iput (inode);
+ spin_lock(&shmem_ilock);
+ if (ret)
break;
}
spin_unlock (&shmem_ilock);
+ iput(next_inode);
}
-
/*
* shmem_file_setup - get an unlinked file living in shmem fs
On 27 Dec 2000, Christoph Rohland wrote:
> BTW: The generic swapoff path itself still has races if a process is
> paging in a page which has just been freed on swap by try_to_unuse. It
> gives 'VM: bad swap entries' and worse. But this is not shmem
> specific. Marcelo, would you like to look into this?
Sure.
Look at this comment on try_to_unuse():
swap_device_lock(si);
for (i = 1; i < si->max ; i++) {
if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
/*
* Prevent swaphandle from being completely
* unused by swap_free while we are trying
* to read in the page - this prevents warning
* messages from rw_swap_page_base.
*/
if (si->swap_map[i] != SWAP_MAP_MAX)
si->swap_map[i]++;
swap_device_unlock(si);
goto found_entry;
...
I think that incrementing the swap entry count will not allow swap to
remove the swap entry (as the comment says).
What am I missing here?
Thanks
Marcelo Tosatti <[email protected]> writes:
> I think that incrementing the swap entry count will not allow swap to
> remove the swap entry (as the comment says)
I think the culprit is somewhere else. The error occurs in nopage of a
process, not in swapoff.
Looking at the following in try_to_unuse:
found_entry:
entry = SWP_ENTRY(type, i);
/* Get a page for the entry, using the existing swap
cache page if there is one. Otherwise, get a clean
page and read the swap into it. */
page = read_swap_cache(entry);
if (!page) {
swap_free(entry);
return -ENOMEM;
}
if (PageSwapCache(page))
delete_from_swap_cache(page);
read_lock(&tasklist_lock);
for_each_task(p)
unuse_process(p->mm, entry, page);
read_unlock(&tasklist_lock);
shmem_unuse(entry, page);
and in unuse_pte:
if (pte_present(pte)) {
/* If this entry is swap-cached, then page must already
hold the right address for any copies in physical
memory */
if (pte_page(pte) != page)
return;
/* We will be removing the swap cache in a moment, so... */
ptep_mkdirty(dir);
return;
}
I see at least one wrong comment: there is no swap cache any more... And
another thought: why can we remove the swap cache entry before finding
the process? What prevents reallocation of the swap cache entry by a
page fault?
BTW you can reproduce the problem on a UP system quite easily by
thrashing and always disabling/enabling two swap partitions round robin.
Greetings
Christoph
On 27 Dec 2000, Christoph Rohland wrote:
> Marcelo Tosatti <[email protected]> writes:
>
> > I think that incrementing the swap entry count will not allow swap to
> > remove the swap entry (as the comment says)
>
> I think the culprit is somewhere else. The error occurs in nopage of a
> process, not in swapoff.
I think swapoff() should be fixed..
I moved the
if (PageSwapCache(page))
delete_from_swap_cache(page);
thing to _before_ the "unuse_process()" and "shmem_unuse()" code, because
I wanted to avoid a race on the PageDirty bit that way. However, that
opens up another race, the one you see with "nopage".
Would you mind testing this alternate fix instead:
- add the lines
repeat:
ClearPageDirty(page);
to just before the "read_lock(&tasklist_lock);" in try_to_unuse(). We
can obviously mark the page clean, because we _are_ going to get rid of
it, and in the meantime we have a hold on it by virtue of having raised
the page count in "read_swap_cache()".
- move the "delete_from_swap_cache()" call back to _after_ the
unuse() calls.
- but just before deleting the entry, we add a new test:
if (PageDirty(page))
goto repeat;
this all should mean that if something moves the dirty bit from a page
table to the backing store, we will notice, and just re-do the VM scan,
which will mark the page table entry dirty again. And because we delete it
from the swap cache late, we aren't severing the link with
"nopage" handling.
Christoph, how does this sound to you?
Linus
Linus Torvalds <[email protected]> writes:
> On 27 Dec 2000, Christoph Rohland wrote:
> Would you mind testing this alternate fix instead:
It does not work, but I think it is the right direction.
First we need the following patch since otherwise we use a swap entry
without having the count increased:
--- 4-13-4/mm/vmscan.c Fri Dec 22 10:05:38 2000
+++ m4-13-4/mm/vmscan.c Thu Dec 28 11:57:57 2000
@@ -93,8 +93,8 @@
entry.val = page->index;
if (pte_dirty(pte))
SetPageDirty(page);
-set_swap_pte:
swap_duplicate(entry);
+set_swap_pte:
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
@@ -185,7 +185,7 @@
* we have the swap cache set up to associate the
* page with that swap entry.
*/
- entry = get_swap_page();
+ entry = __get_swap_page(2);
if (!entry.val)
goto out_unlock_restore; /* No swap space left */
Second, look at this in handle_pte_fault:
/*
* If it truly wasn't present, we know that kswapd
* and the PTE updates will not touch it later. So
* drop the lock.
*/
spin_unlock(&mm->page_table_lock);
if (pte_none(entry))
return do_no_page(mm, vma, address, write_access, pte);
return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
The comment is wrong. try_to_unuse will touch it. This stumbles over a
bad swap entry after try_to_unuse complaining about an undead swap
entry.
If I retry in try_to_unuse it goes into an infinite loop since it
deadlocks with this.
Ideas?
Christoph
On 28 Dec 2000, Christoph Rohland wrote:
>
> First we need the following patch since otherwise we use a swap entry
> without having the count increased:
No, that shouldn't be needed.
Look at the code-path: the kernel has the page locked, so nothing will
de-allocate the swap entry - so it's perfectly ok to increase it later. I
dislike the "magic" __get_swap_page(2) thing - we might make
get_swap_page() itself _always_ return a swap entry with count two (one
for the swap cache, one for the user), or we should keep it the way it was
(where we explicitly increment it for the user).
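For instance, if get_swap_page() is (or becomes) a trivial wrapper
around __get_swap_page(), the first alternative would be little more
than this (hypothetical, just to illustrate the idea):

/* Always hand out a swap entry with map count two: one reference
 * for the swap cache, one for the user.  Callers then never need
 * the "magic" __get_swap_page(2) directly. */
#define get_swap_page() __get_swap_page(2)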
> Second, look at this in handle_pte_fault:
>
> /*
> * If it truly wasn't present, we know that kswapd
> * and the PTE updates will not touch it later. So
> * drop the lock.
> */
> spin_unlock(&mm->page_table_lock);
> if (pte_none(entry))
> return do_no_page(mm, vma, address, write_access, pte);
> return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
>
> The comment is wrong. try_to_unuse will touch it. This stumbles over a
> bad swap entry after try_to_unuse complaining about an undead swap
> entry.
>
> If I retry in try_to_unuse it goes into an infinite loop since it
> deadlocks with this.
Ok. How about making try_to_unuse() just get the VM semaphore instead of
the page table lock?
Then try_to_unuse() would follow all the right rules, and the above
problem wouldn't exist..
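Something like the following in unuse_process() (an untested sketch;
note that unuse_process() is currently called under the tasklist_lock
read lock, so that caller would have to be reworked as well, since
down() can sleep):

static void unuse_process(struct mm_struct * mm, swp_entry_t entry,
			  struct page * page)
{
	struct vm_area_struct * vma;

	if (!mm)
		return;
	/* Take the VM semaphore like the fault path does, instead of
	 * relying only on the page table spinlock. */
	down(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
		unuse_vma(vma, pgd, entry, page);
	}
	up(&mm->mmap_sem);
}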
Linus
Linus Torvalds <[email protected]> writes:
> On 28 Dec 2000, Christoph Rohland wrote:
> >
> > First we need the following patch since otherwise we use a swap entry
> > without having the count increased:
>
> No, that shouldn't be needed.
>
> Look at the code-path: the kernel has the page locked, so nothing will
> de-allocate the swap entry - so it's perfectly ok to increase it
> later.
I am not sure that page locking is done very strictly in the swap
cache. I had to fiddle around with that sometimes in shmem.c.
The patch actually gets me the 'right' error: undead swap entries in
swapoff instead of bad swap entries in nopage. The latter means there
is a swap entry noted in the page table which was legally removed, and
that's definitely wrong.
> I dislike the "magic" __get_swap_page(2) thing - we might make
> get_swap_page() itself _always_ return a swap entry with count two
> (one for the swap cache, one for the user), or we should keep it the
> way it was (where we explicitly increment it for the user).
Yes, I agree. I was too lazy to check the other uses of get_swap_page
for this patchlet. But at least shmem.c uses the same scheme, and I
think it's logical: we always need a swap cache reference and a user
reference.
> Ok. How about making try_to_unuse() just get the VM semaphore instead of
> the page table lock?
>
> Then try_to_unuse() would follow all the right rules, and the above
> problem wouldn't exist..
If this is right we should do it. There is no need to worry about
contention in swapoff; it's definitely not the common path. But we
have to be careful about deadlocks...
Greetings
Christoph