From: Radosław Smogura
Subject: [PATCH 10/18] Support for huge page faulting
Date: Thu, 16 Feb 2012 15:31:37 +0100
Message-ID: <1329402705-25454-10-git-send-email-mail@smogura.eu>
References: <1329402705-25454-1-git-send-email-mail@smogura.eu>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
Cc: Yongqiang Yang , mail@smogura.eu, linux-ext4@vger.kernel.org
To: linux-mm@kvack.org
Return-path:
In-Reply-To: <1329402705-25454-1-git-send-email-mail@smogura.eu>
Sender: owner-linux-mm@kvack.org
List-Id: linux-ext4.vger.kernel.org

Adds some basic VM routines and macros to operate on the huge page cache,
designed for proper faulting of huge pages.

1. __do_fault - made common for huge and small pages.
2. Simple wrappers for rmapping huge pages.
3. Other changes.

Signed-off-by: Radosław Smogura
---
 include/linux/defrag-pagecache.h |   18 +--
 include/linux/fs.h               |   19 +-
 include/linux/mm.h               |   28 ++
 include/linux/mm_types.h         |    2 +-
 include/linux/rmap.h             |    9 +
 mm/huge_memory.c                 |   42 +++
 mm/memory.c                      |  528 +++++++++++++++++++++++++++++++-------
 mm/page-writeback.c              |   31 +++
 mm/rmap.c                        |   29 ++
 9 files changed, 582 insertions(+), 124 deletions(-)

diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
index 46793de..4ca3468 100644
--- a/include/linux/defrag-pagecache.h
+++ b/include/linux/defrag-pagecache.h
@@ -8,7 +8,7 @@
 
 #ifndef DEFRAG_PAGECACHE_H
 #define DEFRAG_PAGECACHE_H
-#include
+#include
 
 /* XXX Split this file into two public and protected - comments below
  * Protected will contain
@@ -24,22 +24,6 @@ typedef struct page *defrag_generic_get_page(
 		const struct defrag_pagecache_ctl *ctl, struct inode *inode,
 		pgoff_t pageIndex);
 
-/** Passes additional information and controls to page defragmentation. */
-struct defrag_pagecache_ctl {
-	/** If yes defragmentation will try to fill page caches. */
-	char fillPages:1;
-
-	/** If filling of page fails, defragmentation will fail too. Setting
-	 * this requires {@link #fillPages} will be setted.
-	 */
-	char requireFillPages:1;
-
-	/** If yes defragmentation will try to force in many aspects, this may
-	 * cause, operation to run longer, but with greater probability of
-	 * success. */
-	char force:1;
-};
-
 /** Defragments page cache of specified file and migrates it's to huge pages.
  *
  * @param f
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bfd9122..7288166 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,10 +10,7 @@
 #include
 #include
 #include
-
-#ifdef CONFIG_HUGEPAGECACHE
-#include
-#endif
+#include
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -596,6 +593,9 @@ struct address_space_operations {
 	/* Set a page dirty.  Return true if this dirtied it */
 	int (*set_page_dirty)(struct page *page);
 
+	/** Same as \a set_page_dirty but for a huge page */
+	int (*set_page_dirty_huge)(struct page *page);
+
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 
@@ -606,7 +606,6 @@ struct address_space_operations {
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 
-#ifdef CONFIG_HUGEPAGECACHE
 	/** Used to defrag (migrate) pages at position {@code pos}
 	 * to huge pages. Having this not {@code NULL} will indicate that
 	 * the address space, generally, supports huge pages (transparent
@@ -616,15 +615,19 @@ struct address_space_operations {
 	 *
 	 * @param pagep on success will be set to the established huge page
 	 *
-	 * @returns TODO What to return?
-	 * {@code 0} on success, value less then {@code 0} on error
+	 * @returns {@code 0} on success, a value less than {@code 0} on error
 	 */
 	int (*defragpage) (struct file *, struct address_space *mapping,
 			loff_t pos, struct page **pagep,
 			const struct defrag_pagecache_ctl *ctl);
-#endif
 
+	/** Used to split a page; this method may be called under memory
+	 * pressure. Actually, you should not split the page.
+	 */
+	int (*split_page) (struct file *file, struct address_space *mapping,
+			loff_t pos, struct page *huge_page);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72f6a50..27a10c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,10 +206,19 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+	/** Same as \a fault but should return a huge page instead of a single
+	 * one. If this function fails, the caller may try again with \a fault.
+	 */
+	int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf);
+
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
 	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+	/** Same as \a page_mkwrite, but for a huge page. */
+	int (*page_mkwrite_huge)(struct vm_area_struct *vma,
+			struct vm_fault *vmf);
+
 	/* called by access_process_vm when get_user_pages() fails, typically
 	 * for use by special VMAs that can switch between memory and hardware
 	 */
@@ -534,6 +543,16 @@ static inline void get_page(struct page *page)
 	}
 }
 
+/** Bumps the tail pages' usage count. If there is at least one tail page
+ * that does not have a valid mapping, the page counts are left untouched.
+ */
+extern void get_page_tails_for_fmap(struct page *head);
+
+/** Decreases the tail pages' usage count.
+ * This function assumes you hold a reference on the compound page or have
+ * frozen it.
+ */
+extern void put_page_tails_for_fmap(struct page *head);
+
 static inline void get_huge_page_tail(struct page *page)
 {
 	/*
@@ -996,6 +1015,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
+#define VM_FAULT_NOHUGE	0x0800	/* ->fault_huge, no huge page available. */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
@@ -1161,6 +1181,14 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 void account_page_dirtied(struct page *page, struct address_space *mapping);
 void account_page_writeback(struct page *page);
 int set_page_dirty(struct page *page);
+
+/** Sets a huge page dirty; this will lock all tails, the head should be
+ * locked. The compound page should be referenced or frozen.
+ * Skips all pages that have no mapping.
+ *
+ * @param head
+ * @return the number of successful set_page_dirty calls
+ */
+int set_page_dirty_huge(struct page *page);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7649722..7d2c09d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -296,7 +296,7 @@ struct vm_area_struct {
 
 	/* Function pointers to deal with this struct. */
 	const struct vm_operations_struct *vm_ops;
-
+
 	/* Information about our backing store: */
 	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
 					   units, *not* PAGE_CACHE_SIZE */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1cdd62a..bc547cb 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -142,8 +142,17 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
 			   unsigned long, int);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
+
+/** Adds an rmap for a huge page; the compound page must be referenced or
+ * frozen.
+ */
+extern void page_add_file_rmap_huge(struct page *head);
+
 void page_remove_rmap(struct page *);
 
+/** Removes the rmap for a huge page; the compound page must be referenced
+ * or frozen.
+ */
+void page_remove_rmap_huge(struct page *);
+
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long);
 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e3b4c38..74d2e84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2455,3 +2455,45 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma,
 			split_huge_page_address(next->vm_mm, nstart);
 	}
 }
+
+/** Bumps the tail pages' usage count. This function assumes you hold a
+ * reference on the compound page or have frozen it.
+ */
+void get_page_tails_for_fmap(struct page *head)
+{
+	struct page *page;
+
+	VM_BUG_ON(!PageHead(head));
+	VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+	VM_BUG_ON(compound_order(head) < 2);
+
+	get_page(head + 1);
+	/* We may use __first_page, because we hold the compound as a whole. */
+	for (page = head + 2; page->__first_page == head; page++) {
+		VM_BUG_ON(!atomic_read(&page->_count));
+		VM_BUG_ON(!page->mapping);
+		VM_BUG_ON(!PageTail(page));
+		get_page(page);
+	}
+}
+
+/** Decreases the tail pages' usage count.
+ * This function assumes you hold a reference on the compound page or have
+ * frozen it.
+ */
+void put_page_tails_for_fmap(struct page *head)
+{
+	struct page *page;
+
+	VM_BUG_ON(!PageHead(head));
+	VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+	VM_BUG_ON(compound_order(head) < 2);
+
+	put_page(head + 1);
+	/* We may use __first_page, because we hold the compound as a whole. */
+	for (page = head + 2; page->__first_page == head; page++) {
+		VM_BUG_ON(!atomic_read(&page->_count));
+		VM_BUG_ON(!page->mapping);
+		VM_BUG_ON(!PageTail(page));
+		put_page(page);
+	}
+}
diff --git a/mm/memory.c b/mm/memory.c
index a0ab73c..7427c9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3148,7 +3148,137 @@ oom:
 	return VM_FAULT_OOM;
 }
 
-/*
+/** Level 0 check whether it is possible to establish a huge pmd in the
+ * process address space.
+ */
+static int check_if_hugemapping_is_possible0(
+		struct vm_area_struct *vma,
+		unsigned long address,
+		pgoff_t pgoff,
+		pmd_t pmdVal /* Keep pmd for THP for Private Mapping. */)
+{
+	if (vma->vm_ops) {
+		/* This is the base check.
*/ + if (!vma->vm_ops->fault_huge) + return 0; + } else { + return 0; + } + + if (vma->vm_flags & VM_SHARED && !(vma->vm_flags & VM_NONLINEAR)) { + /* Check if VMA address is pmd aligned */ + if ((address & ~PMD_MASK) !=3D 0) + return 0; + + /* Check if pgoff is huge page aligned */ + /* XXX This should be exported as it's reused in defrag. */ + if ((pgoff & ((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1)) !=3D 0) + return 0; + + /* Check if huge pmd will fit inside VMA. + * pmd_address_end returns first byte after end, not last byte! + */ + if (!(pmd_addr_end(address, (unsigned long) -1) <=3D vma->vm_end)) + return 0; + + /* WIP [Private THP], check if pmd is marked as do not make THP, + * e.g. because it has COWs. (COWs gives milk). + * We need add such flag because + */ + + /* Check if file has enaugh length - not needed if there is + * huge page in page cache, this implies file has enaugh lenght. + * TODO Think on above. If true make requirement for THP support + * in page cache (put in documentation). + * This may break some concepts that page cache may have not + * up to date huge page, too. + */ + } else { + /* Anonymous VMA - not opcoded, yet. */ + return 0; + } + + /* All tests passed */ + printk(KERN_INFO "Chk - All passed"); + return 1; +} + + +/** Commons function for performing faulting with support for huge pages= . + * This method is designed to be facade-ed, by others. + * + * TODO Still need to consider locking order, to prevent dead locks... + * it's looks like better will be compound_lock -> page_lock + * + * @param page loaded head page, locked iff compound_lock, getted + * + * @return {@code 0} on success + */ +static /*inline*/ int __huge_lock_check( + struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pud_t *pud, + pmd_t pmd, + pgoff_t pgoff, + unsigned int flags, + struct page *head) +{ + struct page *workPage; + unsigned long workAddress; + unsigned int processedPages; + + int result =3D 0; + + VM_BUG_ON(!check_if_hugemapping_is_possible0(vma, address, pgoff, + pmd)); + VM_BUG_ON(atomic_read(&head->_count) <=3D 2); + VM_BUG_ON(!PageHead(head)); + + /* TODO [Documentation] expose below rules, from code. + * + * XXX Is it possible to with tests in loop to map not uptodate pages? + * + * It's looks like that with following designe we require that removing + * page uptodate flag, for compound pages, may require compound lock + * or something else. + */ + + /* Check if tail pages are uptodate, this should not happen, + * as we have compound_lock, but I can't guarantee and linear ordered. + */ + processedPages =3D 0; + workAddress =3D address; + /** XXX [Performance] compound_head is rather slow make new macro, when + * we have compound page getted. + */ + for (workPage =3D head; compound_head(workPage) =3D=3D head; workPage++= ) { + if (!PageUptodate(workPage) + || !workPage->mapping + || (workPage->index - processedPages !=3D pgoff)) { + result =3D -EINVAL; + goto exit_processing; + } + /* We don't check ptes, because we have shared mapping + * so all ptes should be (or could be in future) same, meaning + * mainly protection flags. This check will be required for + * private mapping. + */ + processedPages++; + workAddress +=3D PAGE_SIZE; + } + if (processedPages !=3D (1 << (PMD_SHIFT - PAGE_SHIFT))) { + /* Not enaugh processed pages, why? */ + return processedPages + 1; + } + +exit_processing: + printk("Processed %d", processedPages); + + return result; +} + +/** * __do_fault() tries to create a new page mapping. 
It aggressively * tries to share with existing pages, but makes a separate copy if * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid @@ -3160,28 +3290,45 @@ oom: * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * This method shares same concepts for single and huge pages. + * + * @param pud pud entry, if NULL method operates in single page mode, ot= herwise + * operates in huge page mode. */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static inline int __do_fault(struct mm_struct *mm, struct vm_area_struct= *vma, + unsigned long address, pud_t *pud, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, + pmd_t orig_pmd, pte_t orig_pte) { pte_t *page_table; + pmd_t *huge_table; + + pte_t entry; + pmd_t hentry; + spinlock_t *ptl; struct page *page; struct page *cow_page; - pte_t entry; + int anon =3D 0; struct page *dirty_page =3D NULL; struct vm_fault vmf; + const struct vm_operations_struct *vm_ops =3D vma->vm_ops; int ret; int page_mkwrite =3D 0; =20 + VM_BUG_ON((!!pmd) =3D=3D (!!pud)); + /* * If we do COW later, allocate page befor taking lock_page() * on the file cache page. This will reduce lock holding time. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - + if (pud) { + /* Privte mapping write not supported yet. */ + BUG(); + } if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; =20 @@ -3196,14 +3343,20 @@ static int __do_fault(struct mm_struct *mm, struc= t vm_area_struct *vma, } else cow_page =3D NULL; =20 - vmf.virtual_address =3D (void __user *)(address & PAGE_MASK); + vmf.virtual_address =3D (void __user *) + (address & (pud ? HPAGE_MASK : PAGE_MASK)); vmf.pgoff =3D pgoff; vmf.flags =3D flags; vmf.page =3D NULL; =20 - ret =3D vma->vm_ops->fault(vma, &vmf); + /** XXX Tails should be getted to. */ + if (pud) + ret =3D vm_ops->fault_huge(vma, &vmf); + else + ret =3D vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) + VM_FAULT_RETRY | VM_FAULT_NOHUGE))) goto uncharge_out; =20 if (unlikely(PageHWPoison(vmf.page))) { @@ -3213,21 +3366,36 @@ static int __do_fault(struct mm_struct *mm, struc= t vm_area_struct *vma, goto uncharge_out; } =20 - /* - * For consistency in subsequent calls, make the faulted page always - * locked. + /* For consistency in subsequent calls, make the faulted page + * always locked. */ if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf.page); else VM_BUG_ON(!PageLocked(vmf.page)); =20 + page =3D vmf.page; + if (pud) { + /* Check consystency of page, if it is applicable for huge + * mapping. + */ + if (__huge_lock_check(mm, vma, address, pud, orig_pmd, pgoff, + flags, vmf.page)) { + unlock_page(page); + goto unwritable_page; + } + } + /* * Should we do an early C-O-W break? */ - page =3D vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + if (pud) { + /* Private cowing not supported yet for huge. 
*/ + BUG(); + } + page =3D cow_page; anon =3D 1; copy_user_highpage(page, vmf.page, address, vma); @@ -3238,89 +3406,156 @@ static int __do_fault(struct mm_struct *mm, stru= ct vm_area_struct *vma, * address space wants to know that the page is about * to become writable */ - if (vma->vm_ops->page_mkwrite) { + if ((!pud && vm_ops->page_mkwrite) || + (pud && vm_ops->page_mkwrite_huge)) { int tmp; - unlock_page(page); vmf.flags =3D FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp =3D vma->vm_ops->page_mkwrite(vma, &vmf); + tmp =3D vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret =3D tmp; goto unwritable_page; } if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + if (pud) + BUG(); lock_page(page); if (!page->mapping) { ret =3D 0; /* retry the fault */ - unlock_page(page); goto unwritable_page; } } else VM_BUG_ON(!PageLocked(page)); - page_mkwrite =3D 1; + page_mkwrite =3D 1 << (PMD_SHIFT - PAGE_SHIFT); } } =20 } =20 - page_table =3D pte_offset_map_lock(mm, pmd, address, &ptl); - - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. + /* Following if is almost same for pud and not pud just, specified + * methods changed. Keep it as far as possi ble synchronized */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry =3D mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry =3D maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); + if (pud) { + huge_table =3D pmd_offset(pud, address); + /* During allocation of pte pte_alloc uses, mm's page table lock + * it is not best solution, but we reuse it here. + */ + ptl =3D &mm->page_table_lock; + spin_lock(ptl); + if (likely(pmd_same(*huge_table, orig_pmd))) { + flush_icache_page(vma, page);/* TODO Arch specific? 
*/ + hentry =3D mk_pmd(page, vma->vm_page_prot); + hentry =3D pmd_mkhuge(hentry); + if (flags & FAULT_FLAG_WRITE) { - dirty_page =3D page; - get_page(dirty_page); + hentry =3D pmd_mkdirty(hentry); + /* TODO make it pmd_maybe_mkwrite*/ + if (likely(vma->vm_flags & VM_WRITE)) + hentry =3D pmd_mkwrite(hentry); } - } - set_pte_at(mm, address, page_table, entry); + if (anon) { + BUG(); + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + /* TODO Inc of huge pages counter...*/ + add_mm_counter_fast(mm, MM_FILEPAGES, + HPAGE_PMD_NR); + page_add_file_rmap_huge(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page =3D page; + get_page(dirty_page); + get_page_tails_for_fmap(dirty_page); + } + } + set_pmd_at(mm, address, huge_table, hentry); =20 - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); + /* no need to invalidate: a not-present page won't be + * cached */ + update_mmu_cache(vma, address, page_table); + } else { + if (cow_page) + mem_cgroup_uncharge_page(cow_page); + if (anon) + page_cache_release(page); + else + anon =3D 1; /* no anon but release faulted_page */ + } + spin_unlock(ptl); } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon =3D 1; /* no anon but release faulted_page */ - } + page_table =3D pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if FAULT_FLAG_WRITE is set, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... 
*/ + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry =3D mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) + entry =3D maybe_mkwrite(pte_mkdirty(entry), vma); + if (anon) { + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page =3D page; + get_page(dirty_page); + } + } + set_pte_at(mm, address, page_table, entry); =20 - pte_unmap_unlock(page_table, ptl); + /* no need to invalidate: a not-present page won't be + * cached */ + update_mmu_cache(vma, address, page_table); + } else { + if (cow_page) + mem_cgroup_uncharge_page(cow_page); + if (anon) + page_cache_release(page); + else + anon =3D 1; /* no anon but release faulted_page */ + } + pte_unmap_unlock(page_table, ptl); + } =20 if (dirty_page) { struct address_space *mapping =3D page->mapping; =20 - if (set_page_dirty(dirty_page)) - page_mkwrite =3D 1; - unlock_page(dirty_page); + if (pud) { + int dirtied; + dirtied =3D set_page_dirty_huge(dirty_page); + unlock_page(dirty_page); + if (dirtied) + page_mkwrite =3D dirtied; + } else { + if (set_page_dirty(dirty_page)) + page_mkwrite =3D 1; + unlock_page(dirty_page); + } + + if (pud) { + put_page_tails_for_fmap(dirty_page); + compound_put(page); + } + put_page(dirty_page); if (page_mkwrite && mapping) { /* * Some device drivers do not set page.mapping but still * dirty their pages */ - balance_dirty_pages_ratelimited(mapping); + balance_dirty_pages_ratelimited_nr(mapping, + page_mkwrite); } =20 /* file_update_time outside page_lock */ @@ -3328,6 +3563,8 @@ static int __do_fault(struct mm_struct *mm, struct = vm_area_struct *vma, file_update_time(vma->vm_file); } else { unlock_page(vmf.page); + if (pud) + compound_put(page); if (anon) page_cache_release(vmf.page); } @@ -3335,6 +3572,10 @@ static int __do_fault(struct mm_struct *mm, struct= vm_area_struct *vma, return ret; =20 unwritable_page: + if (pud) { + compound_put(page); + put_page_tails_for_fmap(page); + } page_cache_release(page); return ret; uncharge_out: @@ -3346,6 +3587,33 @@ uncharge_out: return ret; } =20 +/** Facade for {@link __do_fault} to fault "huge" pages. + * GCC will strip unneeded code basing on parameters passed. + */ +static int __do_fault_huge(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pud_t *pud, + pgoff_t pgoff, unsigned int flags, + pmd_t orig_pmd) +{ + pte_t pte_any; + return __do_fault( + mm, vma, address, pud, NULL, pgoff, flags, orig_pmd, pte_any); +} + +/** Facade for {@link __do_fault} to fault "normal", pte level pages. + * GCC will strip unneeded code basing on parameters passed. 
+ */ +static int __do_fault_normal(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + pmd_t pmd_any; + return __do_fault( + mm, vma, address, NULL, pmd, pgoff, flags, pmd_any, orig_pte); +} + static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *= vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3354,7 +3622,7 @@ static int do_linear_fault(struct mm_struct *mm, st= ruct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; =20 pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte)= ; } =20 /* @@ -3386,7 +3654,7 @@ static int do_nonlinear_fault(struct mm_struct *mm,= struct vm_area_struct *vma, } =20 pgoff =3D pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte)= ; } =20 /* @@ -3455,6 +3723,105 @@ unlock: return 0; } =20 +/** Handles fault on pde level.*/ +int handle_pmd_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pud_t *pud, pmd_t *pmd, unsigned int flags) +{ + pte_t *pte; + pgoff_t pgoff; + pmd_t pmdVal; + int faultResult; + + if (!vma->vm_file) { + /* Anonymous THP handling */ + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) { + return do_huge_pmd_anonymous_page(mm, vma, + address, pmd, flags); + } + } else { + pmd_t orig_pmd =3D *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, + address, pmd, orig_pmd); + return 0; + } + goto handle_pte_level; + } + } + /*************************** + * Page cache THP handling * + ***************************/ + pmdVal =3D *pmd; + if (pmd_present(pmdVal) && !pmd_trans_huge(pmdVal)) + goto handle_pte_level; + + if ((address & HPAGE_MASK) < vma->vm_start) + goto handle_pte_level; + + /* Even if possible we currently support only for SHARED VMA. + * + * We support this only for shmem fs, but everyone is encorege + * to add few simple methods and test it for other file systems. + * Notes, warrnings etc are always welcome. + */ + if (!(vma->vm_flags & VM_SHARED)) + goto handle_pte_level; + + /* Handle fault of possible vma with huge page. */ + pgoff =3D (((address & HPAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + + if (!pmd_present(pmdVal)) { + /* No page at all. */ + if (!check_if_hugemapping_is_possible0(vma, address, pgoff, + pmdVal)) + goto handle_pte_level; + } else { + /* TODO Jump to make page writable. If not for regular + * filesystems, full fault path will be reused. + */ + } + + faultResult =3D __do_fault_huge(mm, vma, address, pud, pgoff, flags, + pmdVal); + if (!(faultResult & (VM_FAULT_ERROR | VM_FAULT_NOHUGE))) { + printk(KERN_INFO "Setted huge pmd"); + return faultResult; + } + +handle_pte_level: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. 
+ */ + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + return VM_FAULT_OOM; + /* Page cache THP uses mm->page_table_lock to check if pmd is still + * none just before setting ne huge pmd, is __pte_alloc suceeded + * then pmd may be huge or "normal" with ptes page. + * + * if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte =3D pte_offset_map(pmd, address); + + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + /* * By the time we get here, we already hold the mm semaphore */ @@ -3464,7 +3831,6 @@ int handle_mm_fault(struct mm_struct *mm, struct vm= _area_struct *vma, pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte; =20 __set_current_state(TASK_RUNNING); =20 @@ -3484,42 +3850,8 @@ int handle_mm_fault(struct mm_struct *mm, struct v= m_area_struct *vma, pmd =3D pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); - } else { - pmd_t orig_pmd =3D *pmd; - barrier(); - if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); - return 0; - } - } =20 - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte =3D pte_offset_map(pmd, address); - - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + return handle_pmd_fault(mm, vma, address, pud, pmd, flags); } =20 #ifndef __PAGETABLE_PUD_FOLDED diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 363ba70..ff32b5d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2072,6 +2072,37 @@ int set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); =20 +int set_page_dirty_huge(struct page *head) +{ + struct page *work; + int result =3D 0; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(!PageLocked(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) =3D=3D 1); + + if (head->mapping) + result +=3D set_page_dirty(head); + else + BUG_ON(!PageSplitDeque(head)); + + for (work =3D head+1; compound_head(work) =3D=3D head; work++) { + VM_BUG_ON(page_has_private(work)); + VM_BUG_ON(page_has_buffers(work)); + + lock_page(work); + if (work->mapping) { + result +=3D set_page_dirty(work); + } else { + /* Bug if there is no mapping and split is not + * dequeued. + */ + BUG_ON(!PageSplitDeque(head)); + } + unlock_page(work); + } + return result; +} /* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. 
This is because another
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e0..11f54e0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1157,6 +1157,21 @@ void page_add_file_rmap(struct page *page)
 	}
 }
 
+void page_add_file_rmap_huge(struct page *head)
+{
+	struct page *page;
+
+	VM_BUG_ON(!PageHead(head));
+	VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+	page_add_file_rmap(head);
+	page_add_file_rmap(head + 1);
+	if (likely(compound_order(head) > 1)) {
+		for (page = head + 2; page->__first_page == head; page++)
+			page_add_file_rmap(page);
+	}
+}
+
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
@@ -1207,6 +1222,20 @@ void page_remove_rmap(struct page *page)
 	 */
 }
 
+void page_remove_rmap_huge(struct page *head)
+{
+	struct page *page;
+
+	VM_BUG_ON(!PageHead(head));
+	VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+	page_remove_rmap(head);
+	page_remove_rmap(head + 1);
+	if (likely(compound_order(head) > 1)) {
+		for (page = head + 2; page->__first_page == head; page++)
+			page_remove_rmap(page);
+	}
+}
 /*
  * Subfunctions of try_to_unmap: try_to_unmap_one called
  * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
-- 
1.7.3.4